Hi all, This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in the next few months. I'm submitting 2 patches in the following couple of emails, which have the new created celt_fir_neon(). I revised celt_fir_c() to not pass in argument "mem" in Patch 1. If there are concerns to this change, please let me know. Many thanks to your comments. Linfeng Zhang
Linfeng Zhang
2016-Jun-17 21:09 UTC
[opus] [PATCH 1/2] Revise celt_fir_c() to not pass in argument "mem"
The "mem" in celt_fir_c() either is contained in the head of input "x" in reverse order already, or can be easily attached to the head of "x" before calling the function. Removing argument "mem" can eliminate the redundant buffer copies inside. Update celt_fir_sse4_1() accordingly. --- celt/celt_decoder.c | 10 ++++----- celt/celt_lpc.c | 33 +++++++++++------------------- celt/celt_lpc.h | 5 ++--- celt/x86/celt_lpc_sse.c | 51 +++++++++------------------------------------- celt/x86/celt_lpc_sse.h | 10 ++++----- celt/x86/x86_celt_map.c | 1 - silk/LPC_analysis_filter.c | 6 +----- 7 files changed, 34 insertions(+), 82 deletions(-) diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index b978bb3..f8433eb 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -509,7 +509,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val16 fade = Q15ONE; int pitch_index; VARDECL(opus_val32, etmp); - VARDECL(opus_val16, exc); + VARDECL(opus_val16, _exc); if (loss_count == 0) { @@ -520,7 +520,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } ALLOC(etmp, overlap, opus_val32); - ALLOC(exc, MAX_PERIOD, opus_val16); + ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16); + opus_val16 *exc = _exc+LPC_ORDER; window = mode->window; c=0; do { opus_val16 decay; @@ -568,15 +569,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) /* Initialize the LPC history with the samples just before the start of the region for which we're computing the excitation. */ { - opus_val16 lpc_mem[LPC_ORDER]; for (i=0;i<LPC_ORDER;i++) { - lpc_mem[i] + exc[MAX_PERIOD-exc_length-1-i] ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT); } /* Compute the excitation for exc_length samples before the loss. */ celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, - exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch); + exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, st->arch); } /* Check if the waveform is decaying, and if so how fast. diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index f02145a..80124df 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -90,56 +90,47 @@ int p void celt_fir_c( - const opus_val16 *_x, + const opus_val16 *x, const opus_val16 *num, - opus_val16 *_y, + opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch) { int i,j; VARDECL(opus_val16, rnum); - VARDECL(opus_val16, x); SAVE_STACK; ALLOC(rnum, ord, opus_val16); - ALLOC(x, N+ord, opus_val16); for(i=0;i<ord;i++) rnum[i] = num[ord-i-1]; - for(i=0;i<ord;i++) - x[i] = mem[ord-i-1]; - for (i=0;i<N;i++) - x[i+ord]=_x[i]; - for(i=0;i<ord;i++) - mem[i] = _x[N-i-1]; #ifdef SMALL_FOOTPRINT (void)arch; for (i=0;i<N;i++) { - opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); for (j=0;j<ord;j++) { - sum = MAC16_16(sum,rnum[j],x[i+j]); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); } - _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); } #else for (i=0;i<N-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; - xcorr_kernel(rnum, x+i, sum, ord, arch); - _y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT))); - _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT))); - _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT))); - _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT))); + xcorr_kernel(rnum, x+i-ord, sum, ord, arch); + y[i ] = SATURATE16(ADD32(EXTEND32(x[i ]), PSHR32(sum[0], SIG_SHIFT))); + y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT))); + y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT))); + y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT))); } for (;i<N;i++) { opus_val32 sum = 0; for (j=0;j<ord;j++) - sum = MAC16_16(sum,rnum[j],x[i+j]); - _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); + y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); } #endif RESTORE_STACK; diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index 323459e..a4c5fd6 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -45,12 +45,11 @@ void celt_fir_c( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); #if !defined(OVERRIDE_CELT_FIR) -#define celt_fir(x, num, y, N, ord, mem, arch) \ - (celt_fir_c(x, num, y, N, ord, mem, arch)) +#define celt_fir(x, num, y, N, ord, arch) \ + (celt_fir_c(x, num, y, N, ord, arch)) #endif void celt_iir(const opus_val32 *x, diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c index 67e5592..12a9b0e 100644 --- a/celt/x86/celt_lpc_sse.c +++ b/celt/x86/celt_lpc_sse.c @@ -40,63 +40,32 @@ #if defined(FIXED_POINT) -void celt_fir_sse4_1(const opus_val16 *_x, +void celt_fir_sse4_1(const opus_val16 *x, const opus_val16 *num, - opus_val16 *_y, + opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch) { int i,j; VARDECL(opus_val16, rnum); - VARDECL(opus_val16, x); __m128i vecNoA; opus_int32 noA ; SAVE_STACK; ALLOC(rnum, ord, opus_val16); - ALLOC(x, N+ord, opus_val16); for(i=0;i<ord;i++) rnum[i] = num[ord-i-1]; - for(i=0;i<ord;i++) - x[i] = mem[ord-i-1]; - - for (i=0;i<N-7;i+=8) - { - x[i+ord ]=_x[i ]; - x[i+ord+1]=_x[i+1]; - x[i+ord+2]=_x[i+2]; - x[i+ord+3]=_x[i+3]; - x[i+ord+4]=_x[i+4]; - x[i+ord+5]=_x[i+5]; - x[i+ord+6]=_x[i+6]; - x[i+ord+7]=_x[i+7]; - } - - for (;i<N-3;i+=4) - { - x[i+ord ]=_x[i ]; - x[i+ord+1]=_x[i+1]; - x[i+ord+2]=_x[i+2]; - x[i+ord+3]=_x[i+3]; - } - - for (;i<N;i++) - x[i+ord]=_x[i]; - - for(i=0;i<ord;i++) - mem[i] = _x[N-i-1]; #ifdef SMALL_FOOTPRINT for (i=0;i<N;i++) { - opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); for (j=0;j<ord;j++) { - sum = MAC16_16(sum,rnum[j],x[i+j]); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); } - _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); } #else noA = EXTEND32(1) << SIG_SHIFT >> 1; @@ -107,22 +76,22 @@ void celt_fir_sse4_1(const opus_val16 *_x, opus_val32 sums[4] = {0}; __m128i vecSum, vecX; - xcorr_kernel(rnum, x+i, sums, ord, arch); + xcorr_kernel(rnum, x+i-ord, sums, ord, arch); vecSum = _mm_loadu_si128((__m128i *)sums); vecSum = _mm_add_epi32(vecSum, vecNoA); vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); - vecX = OP_CVTEPI16_EPI32_M64(_x + i); + vecX = OP_CVTEPI16_EPI32_M64(x + i); vecSum = _mm_add_epi32(vecSum, vecX); vecSum = _mm_packs_epi32(vecSum, vecSum); - _mm_storel_epi64((__m128i *)(_y + i), vecSum); + _mm_storel_epi64((__m128i *)(y + i), vecSum); } for (;i<N;i++) { opus_val32 sum = 0; for (j=0;j<ord;j++) - sum = MAC16_16(sum, rnum[j], x[i + j]); - _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + sum = MAC16_16(sum, rnum[j], x[i+j-ord]); + y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); } #endif diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index c5ec796..7d1ecf7 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -41,12 +41,11 @@ void celt_fir_sse4_1( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) -#define celt_fir(x, num, y, N, ord, mem, arch) \ - ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) +#define celt_fir(x, num, y, N, ord, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) #else @@ -56,11 +55,10 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); -# define celt_fir(x, num, y, N, ord, mem, arch) \ - ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch)) +# define celt_fir(x, num, y, N, ord, arch) \ + ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) #endif #endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 8e5e449..1331c10 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -46,7 +46,6 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch ) = { celt_fir_c, /* non-sse */ diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c index 2090667..5aeee4c 100644 --- a/silk/LPC_analysis_filter.c +++ b/silk/LPC_analysis_filter.c @@ -50,7 +50,6 @@ void silk_LPC_analysis_filter( { opus_int j; #ifdef FIXED_POINT - opus_int16 mem[SILK_MAX_ORDER_LPC]; opus_int16 num[SILK_MAX_ORDER_LPC]; #else int ix; @@ -67,10 +66,7 @@ void silk_LPC_analysis_filter( for ( j = 0; j < d; j++ ) { num[ j ] = -B[ j ]; } - for (j=0;j<d;j++) { - mem[ j ] = in[ d - j - 1 ]; - } - celt_fir( in + d, num, out + d, len - d, d, mem, arch ); + celt_fir( in + d, num, out + d, len - d, d, arch ); for ( j = 0; j < d; j++ ) { out[ j ] = 0; } -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Jun-17 21:09 UTC
[opus] [PATCH 2/2] Optimize fixed-point celt_fir_c() for ARM NEON
Create the fixed-point intrinsics optimization celt_fir_neon() for ARM NEON. Create test celt/tests/test_unit_optimization_lpc to unit test the optimization. --- .gitignore | 2 + Makefile.am | 17 ++- celt/arm/arm_celt_map.c | 17 +++ celt/arm/celt_lpc_arm.h | 65 ++++++++ celt/arm/celt_lpc_neon_intr.c | 254 ++++++++++++++++++++++++++++++++ celt/celt_lpc.h | 5 + celt/tests/test_unit_dft.c | 1 + celt/tests/test_unit_mathops.c | 1 + celt/tests/test_unit_mdct.c | 1 + celt/tests/test_unit_optimization_lpc.c | 121 +++++++++++++++ celt/tests/test_unit_rotation.c | 1 + celt_headers.mk | 1 + celt_sources.mk | 1 + 13 files changed, 482 insertions(+), 5 deletions(-) create mode 100644 celt/arm/celt_lpc_arm.h create mode 100644 celt/arm/celt_lpc_neon_intr.c create mode 100644 celt/tests/test_unit_optimization_lpc.c diff --git a/.gitignore b/.gitignore index 33127c9..6d3b48b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ install-sh .deps .libs .dirstamp +.project *.a *.exe *.la @@ -57,6 +58,7 @@ celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct +celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_types doc/doxygen_sqlite3.db diff --git a/Makefile.am b/Makefile.am index cfdaced..e06d687 100644 --- a/Makefile.am +++ b/Makefile.am @@ -83,9 +83,9 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD) if EXTRA_PROGRAMS -noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_types +noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_types -TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding +TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding opus_demo_SOURCES = src/opus_demo.c @@ -137,6 +137,12 @@ if OPUS_ARM_EXTERNAL_ASM celt_tests_test_unit_mdct_LDADD += libarmasm.la endif +celt_tests_test_unit_optimization_lpc_SOURCES = celt/tests/test_unit_optimization_lpc.c +celt_tests_test_unit_optimization_lpc_LDADD = $(NE10_LIBS) $(LIBM) +if OPUS_ARM_EXTERNAL_ASM +celt_tests_test_unit_optimization_lpc_LDADD += libarmasm.la +endif + celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM) if OPUS_ARM_EXTERNAL_ASM @@ -272,10 +278,11 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl %-gnu.S: %.s $(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@ -OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ - $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \ +OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_dft_SOURCES:.c=.o) \ + $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \ - $(celt_tests_test_unit_dft_SOURCES:.c=.o) + $(celt_tests_test_unit_optimization_lpc_SOURCES:.c=.o) \ + $(celt_tests_test_unit_rotation_SOURCES:.c=.o) if HAVE_SSE SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo) diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index ee6c244..4e90fde 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -29,6 +29,7 @@ #include "config.h" #endif +#include "celt_lpc.h" #include "pitch.h" #include "kiss_fft.h" #include "mdct.h" @@ -36,6 +37,22 @@ #if defined(OPUS_HAVE_RTCD) # if defined(FIXED_POINT) +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + int arch); + +void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val16 *, int, int, int) = { + celt_fir_c, /* ARMv4 */ + celt_fir_c, /* EDSP */ + celt_fir_c, /* Media */ + MAY_HAVE_NEON(celt_fir) /* NEON */ +}; + opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, opus_val32 *, int , int) = { celt_pitch_xcorr_c, /* ARMv4 */ diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h new file mode 100644 index 0000000..101df3d --- /dev/null +++ b/celt/arm/celt_lpc_arm.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(CELT_LPC_ARM_H) +# define CELT_LPC_ARM_H + +# include "armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + int arch); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_CELT_FIR (1) +# define celt_fir(x, num, y, N, ord, arch) \ + ((void)(arch),PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch)) +# endif + +#if !defined(OVERRIDE_CELT_FIR) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \ + || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \ + && !defined(OPUS_ARM_PRESUME_NEON_INTR))) +extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val16 *, int, int, int); + +# define OVERRIDE_CELT_FIR +# define celt_fir(x, num, y, N, ord, arch) \ + ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch)) +# endif +#endif +#endif /* end FIXED_POINT */ + +#endif /* end CELT_LPC_ARM_H */ diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c new file mode 100644 index 0000000..4715d0b --- /dev/null +++ b/celt/arm/celt_lpc_neon_intr.c @@ -0,0 +1,254 @@ +/* Copyright (c) 2016 Google Inc. */ +/** + @file celt_lpc_neon_intr.c + @brief ARM Neon Intrinsic optimizations for celt lpc functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <arm_neon.h> +#include "celt_lpc.h" +#include "stack_alloc.h" + +#if defined(FIXED_POINT) + +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + int arch) +{ + int i,j; + const int leftover = N & 7; + const opus_val16 *x = _x-ord; + VARDECL(opus_val16, rnum); + SAVE_STACK; + /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero. */ + ALLOC(rnum, ord+3, opus_val16); + for (i=0;i<ord-3;i+=4) + vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4))); + for (;i<ord;i++) + rnum[i] = num[ord-i-1]; + rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0; + (void)arch; + +#ifdef SMALL_FOOTPRINT + for (i=0;i<N-7;i+=8) + { + int16x8_t x_s16x8 = vld1q_s16(_x+i); + int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT); + int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT))); + } + if (leftover) + { + if (leftover > 4) + { + int16x8_t x_s16x8 = vld1q_s16(_x+i); + int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT); + int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)); + vst1_s16(y+i, vget_low_s16(y_s16x8)); + vst1q_lane_s16(y+i+4, y_s16x8, 4); + if (leftover >= 6) + { + vst1q_lane_s16(y+i+5, y_s16x8, 5); + if (leftover == 7) + { + vst1q_lane_s16(y+i+6, y_s16x8, 6); + } + } + } + else { + int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3); + } + const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT); + if (leftover == 4) + { + vst1_s16(y+i, y_s16x4); + } + else + { + vst1_lane_s16(y+i, y_s16x4, 0); + if (leftover >= 2) + { + vst1_lane_s16(y+i+1, y_s16x4, 1); + if (leftover == 3) + { + vst1_lane_s16(y+i+2, y_s16x4, 2); + } + } + } + } + } +#else + for (i=0;i<N-7;i+=8) + { + int32x4_t sum0_s32x4, sum1_s32x4; + sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + int16x8_t x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT); + const int16x8_t x_s16x8 = vld1q_s16(_x+i); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8)); + sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8)); + vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4))); + } + if (leftover) + { + if (leftover > 4) + { + int32x4_t sum0_s32x4, sum1_s32x4; + sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + int16x8_t x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT); + const int16x8_t x_s16x8 = vld1q_s16(_x+i); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8)); + sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8)); + const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4)); + vst1_s16(y+i, vget_low_s16(y_s16x8)); + vst1q_lane_s16(y+i+4, y_s16x8, 4); + if (leftover >= 6) + { + vst1q_lane_s16(y+i+5, y_s16x8, 5); + if (leftover == 7) + { + vst1q_lane_s16(y+i+6, y_s16x8, 6); + } + } + } + else { + int32x4_t sum0_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i)); + const int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4); + if (leftover == 4) + { + vst1_s16(y+i, y_s16x4); + } + else + { + vst1_lane_s16(y+i, y_s16x4, 0); + if (leftover >= 2) + { + vst1_lane_s16(y+i+1, y_s16x4, 1); + if (leftover == 3) + { + vst1_lane_s16(y+i+2, y_s16x4, 2); + } + } + } + } + } +#endif + RESTORE_STACK; +} + +#endif diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index a4c5fd6..76a73c0 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -35,6 +35,11 @@ #include "x86/celt_lpc_sse.h" #endif +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/celt_lpc_arm.h" +#endif + #define LPC_ORDER 24 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 6166eb0..582618e 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -52,6 +52,7 @@ # include "celt_lpc.c" # include "pitch.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "mdct.c" diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index fd3319d..da92f16 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -66,6 +66,7 @@ #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/armcpu.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index 8dbb9ca..0658c7a 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -53,6 +53,7 @@ # include "pitch.c" # include "celt_lpc.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "arm/celt_ne10_fft.c" diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c new file mode 100644 index 0000000..146526e --- /dev/null +++ b/celt/tests/test_unit_optimization_lpc.c @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> + +#define CELT_C +#include "stack_alloc.h" +#include "mathops.c" +#include "entcode.c" + +#ifdef FIXED_POINT + +#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +# include "celt_lpc.c" +# include "pitch.c" +# include "x86/x86cpu.c" +# include "x86/celt_lpc_sse.c" +# include "x86/pitch_sse2.c" +# include "x86/pitch_sse4_1.c" +# include "x86/x86_celt_map.c" +#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "celt_lpc.c" +# include "pitch.c" +# include "arm/armcpu.c" +# include "arm/arm_celt_map.c" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" +# endif +#endif + +#define MAX_ORDER 32 + +void test_fir(int arch) +{ + opus_val16 x[MAX_PERIOD+MAX_ORDER]; + opus_val16 num[MAX_ORDER]; + opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD]; + int N, ord; + + unsigned int i; + for(ord=0;ord<=MAX_ORDER;ord++) + { + printf("ord=%2d", ord); + for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */ + { + for (i=0;i<MAX_PERIOD+MAX_ORDER;++i) + { + x[i] = (rand() % 32767) - 16384; + } + for (i=0;i<MAX_PERIOD;++i) + { + yorg[i] = (rand() % 32767) - 16384; + } + for (i=0;i<MAX_ORDER;++i) + { + num[i] = (rand() % 32767) - 16384; + } + memcpy(yopt, yorg, sizeof(yorg)); + + celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch); + celt_fir (x+MAX_ORDER, num, yopt, N, ord, arch); + if (memcmp(yorg, yopt, sizeof(yorg))) + { + printf(" N=%3d failed!\nError in lpc unit test!!!\n", N); + for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++) + { + if (yorg[i] != yopt[i]) + { + printf("yorg[%3d]=%d, yopt[%3d]=%d\n", i, yorg[i], i, yopt[i]); + } + } + exit(0); + } + } + printf(" passed!\n"); + } +} +#endif /* FIXED_POINT */ + +int main(void) +{ +#ifdef FIXED_POINT + ALLOC_STACK; + int arch = opus_select_arch(); + test_fir(arch); +#endif /* FIXED_POINT */ + return 0; +} diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 1080c20..3a85a29 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -64,6 +64,7 @@ #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/armcpu.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" diff --git a/celt_headers.mk b/celt_headers.mk index 0eca6e6..4067e5a 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -34,6 +34,7 @@ celt/static_modes_fixed.h \ celt/static_modes_float_arm_ne10.h \ celt/static_modes_fixed_arm_ne10.h \ celt/arm/armcpu.h \ +celt/arm/celt_lpc_arm.h \ celt/arm/fixed_armv4.h \ celt/arm/fixed_armv5e.h \ celt/arm/kiss_fft_armv4.h \ diff --git a/celt_sources.mk b/celt_sources.mk index 2ffe99a..37c0129 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \ celt/arm/armopts.s.in CELT_SOURCES_ARM_NEON_INTR = \ +celt/arm/celt_lpc_neon_intr.c \ celt/arm/celt_neon_intr.c CELT_SOURCES_ARM_NE10= \ -- 2.8.0.rc3.226.g39d4020
Hi, Linfeng — Please note the aarch64 optimization patches I submitted in November and December (which Tim still hasn’t gotten around to reviewing). As they used Neon intrinsics, several of these actually applied to both armv7 and aarch64 Neon. In particular, note http://lists.xiph.org/pipermail/opus/2015-December/003339.html , which added a Neon-optimized version of xcorr_kernel. xcorr_kernel is used in celt_fir, celt_iir, and celt_pitch_xcorr.> On Jun 17, 2016, at 5:09 PM, Linfeng Zhang <linfengz at google.com> wrote: > > Hi all, > > This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in the > next few months. > > I'm submitting 2 patches in the following couple of emails, which have the new > created celt_fir_neon(). > > I revised celt_fir_c() to not pass in argument "mem" in Patch 1. If there are > concerns to this change, please let me know. > > Many thanks to your comments. > > Linfeng Zhang > > _______________________________________________ > opus mailing list > opus at xiph.org > http://lists.xiph.org/mailman/listinfo/opus
Hi Jonathan, Thanks for pointing me to the related submissions! I noticed the xcorr_kernel_neon() assembly in current code base, but don't know why it's not activated for celt_fir() etc. So I decided to inline it inside celt_fir() to save the sum[] buffer initializing/saving/loading, and handle variable filter order "ord" differently. Thanks, Linfeng On Fri, Jun 17, 2016 at 2:37 PM, Jonathan Lennox <jonathan at vidyo.com> wrote:> Hi, Linfeng — > > Please note the aarch64 optimization patches I submitted in November and > December (which Tim still hasn’t gotten around to reviewing). As they used > Neon intrinsics, several of these actually applied to both armv7 and > aarch64 Neon. > > In particular, note > http://lists.xiph.org/pipermail/opus/2015-December/003339.html , which > added a Neon-optimized version of xcorr_kernel. xcorr_kernel is used in > celt_fir, celt_iir, and celt_pitch_xcorr. > > > On Jun 17, 2016, at 5:09 PM, Linfeng Zhang <linfengz at google.com> wrote: > > > > Hi all, > > > > This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in > the > > next few months. > > > > I'm submitting 2 patches in the following couple of emails, which have > the new > > created celt_fir_neon(). > > > > I revised celt_fir_c() to not pass in argument "mem" in Patch 1. If > there are > > concerns to this change, please let me know. > > > > Many thanks to your comments. > > > > Linfeng Zhang > > > > _______________________________________________ > > opus mailing list > > opus at xiph.org > > http://lists.xiph.org/mailman/listinfo/opus > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.xiph.org/pipermail/opus/attachments/20160617/2dc691ef/attachment.html>
Jonathan Lennox wrote:> Hi, Linfeng — > > Please note the aarch64 optimization patches I submitted in November and December (which Tim still hasn’t gotten around to reviewing). As they used Neon intrinsics, several of these actually applied to both armv7 and aarch64 Neon.Yes, it seems I need to move this up my priority list to avoid everyone wasting time on duplicate work. Fortunately, I have a long plane flight tomorrow... If you have any comments on Jonathan's patches (any of them), Linfeng, that would also be good to know.
Reasonably Related Threads
- Several patches of ARM NEON optimization
- [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON
- [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
- [RFC PATCH v1 0/5] aarch64: celt_pitch_xcorr: Fixed point series
- [RFC PATCH v2]: Ne10 fft fixed and previous 0/8]