Viswanath Puttagunta
2014-Nov-21 23:38 UTC
[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics
Hello, I received feedback from engineers working on NE10 [1] that it would be better to use NE10 [1] for FFT optimizations for opus use cases. However, these FFT patches are currently in review and haven't been integrated into NE10 yet. While the FFT functions in NE10 are getting baked, I wanted to optimize the celt_pitch_xcorr (floating point only) and use it to introduce ARM NEON intrinsics into libopus project. This will also lay the foundation when FFT routines in NE10 become available. I would like to know the following as I DID NOT fully use-case test this celt_pitch_xcorr optimization. Did only unit tests. a. Simplest use case to validate this optimization for correctness. b. Simplest use case to validate this optimization for performance. Would prefer something like opusdec that can be executed on command line. Any other feedback welcome. [1]: http://projectne10.github.io/Ne10/ Viswanath Puttagunta (1): armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics Makefile.am | 23 +++++++++++ celt/_kiss_fft_guts.h | 4 +- celt/arch.h | 4 +- celt/arm/arm_celt_map.c | 15 +++++++- celt/arm/celt_neon_intr.c | 81 +++++++++++++++++++++++++++++++++++++++ celt/arm/pitch_arm.h | 15 +++++++- celt/pitch.h | 17 ++++++-- celt/tests/test_unit_mathops.c | 2 +- celt/tests/test_unit_rotation.c | 2 +- celt_sources.mk | 3 ++ configure.ac | 56 ++++++++++++++++++++++----- silk/SigProc_FIX.h | 4 +- silk/macros.h | 4 +- 13 files changed, 205 insertions(+), 25 deletions(-) create mode 100644 celt/arm/celt_neon_intr.c -- 1.7.9.5
Viswanath Puttagunta
2014-Nov-21 23:38 UTC
[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Optimize celt_pitch_xcorr function (for floating point) using ARM NEON intrinsics for SoCs that have NEON VFP unit. As initial step, targeting ARMv7 NEON (VFP3+) based SoCs. To enable this optimization, use --enable-arm-neon-intrinsics configure option. This flag is not enabled by default. Compile time and runtime checks are also supported to make sure this optimization is only enabled when the compiler supports neon intrinsics. --- Makefile.am | 23 +++++++++++ celt/_kiss_fft_guts.h | 4 +- celt/arch.h | 4 +- celt/arm/arm_celt_map.c | 15 +++++++- celt/arm/celt_neon_intr.c | 81 +++++++++++++++++++++++++++++++++++++++ celt/arm/pitch_arm.h | 15 +++++++- celt/pitch.h | 17 ++++++-- celt/tests/test_unit_mathops.c | 2 +- celt/tests/test_unit_rotation.c | 2 +- celt_sources.mk | 3 ++ configure.ac | 56 ++++++++++++++++++++++----- silk/SigProc_FIX.h | 4 +- silk/macros.h | 4 +- 13 files changed, 205 insertions(+), 25 deletions(-) create mode 100644 celt/arm/celt_neon_intr.c diff --git a/Makefile.am b/Makefile.am index e20f7b4..0e9e120 100644 --- a/Makefile.am +++ b/Makefile.am @@ -38,6 +38,12 @@ CELT_SOURCES += $(CELT_SOURCES_SSE) endif endif +if OPUS_ARM_NEON_INTR +noinst_LTLIBRARIES = libarmneon.la +libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR) +libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include +endif + if CPU_ARM CELT_SOURCES += $(CELT_SOURCES_ARM) SILK_SOURCES += $(SILK_SOURCES_ARM) @@ -59,6 +65,9 @@ include opus_headers.mk libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES) libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@ libopus_la_LIBADD = $(LIBM) +if OPUS_ARM_NEON_INTR +libopus_la_LIBADD += ./libarmneon.la +endif pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h @@ -97,6 +106,11 @@ celt_tests_test_unit_cwrs32_LDADD = $(LIBM) celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c celt_tests_test_unit_dft_LDADD = $(LIBM) +if OPUS_ARM_NEON_INTR +celt_tests_test_unit_dft_LDADD += ./libarmneon.la +endif + + celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c celt_tests_test_unit_entropy_LDADD = $(LIBM) @@ -110,10 +124,16 @@ if CPU_ARM if OPUS_ARM_EXTERNAL_ASM celt_tests_test_unit_mathops_LDADD += libopus.la endif +if OPUS_ARM_NEON_INTR +celt_tests_test_unit_mathops_LDADD += ./libarmneon.la +endif endif celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c celt_tests_test_unit_mdct_LDADD = $(LIBM) +if OPUS_ARM_NEON_INTR +celt_tests_test_unit_mdct_LDADD += ./libarmneon.la +endif celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c celt_tests_test_unit_rotation_LDADD = $(LIBM) @@ -121,6 +141,9 @@ if CPU_ARM if OPUS_ARM_EXTERNAL_ASM celt_tests_test_unit_rotation_LDADD += libopus.la endif +if OPUS_ARM_NEON_INTR +celt_tests_test_unit_rotation_LDADD += ./libarmneon.la +endif endif celt_tests_test_unit_types_SOURCES = celt/tests/test_unit_types.c diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index 5e3d58f..11a2676 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -90,11 +90,11 @@ do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \ }while(0) -#if defined(OPUS_ARM_INLINE_ASM) +#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT) #include "arm/kiss_fft_armv4.h" #endif -#if defined(OPUS_ARM_INLINE_EDSP) +#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT) #include "arm/kiss_fft_armv5e.h" #endif #if defined(MIPSr1_ASM) diff --git a/celt/arch.h b/celt/arch.h index 9f74ddd..0e3061c 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -118,9 +118,9 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) { #include "fixed_generic.h" -#ifdef OPUS_ARM_INLINE_EDSP +#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT) #include "arm/fixed_armv5e.h" -#elif defined (OPUS_ARM_INLINE_ASM) +#elif defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT) #include "arm/fixed_armv4.h" #elif defined (BFIN_ASM) #include "fixed_bfin.h" diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 547a84d..a6313e7 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -42,8 +42,19 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ }; # else -# error "Floating-point implementation is not supported by ARM asm yet." \ - "Reconfigure with --disable-rtcd or send patches." + +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int , int, int) = { + celt_pitch_xcorr_c, /* ARMv4 */ + celt_pitch_xcorr_c, /* EDSP */ + celt_pitch_xcorr_c, /* Media */ +#if defined(OPUS_ARM_NEON_INTR) + celt_pitch_xcorr_float_neon /* Neon */ +#else + celt_pitch_xcorr_c +#endif +}; + # endif #endif diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c new file mode 100644 index 0000000..88954fb --- /dev/null +++ b/celt/arm/celt_neon_intr.c @@ -0,0 +1,81 @@ +#include <arm_neon.h> +#include "../arch.h" + +static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int len) { + float32x4_t YY[5]; + float32x4_t XX[4]; + float32x2_t XX_2; + float32x4_t SUMM[4]; + float *xi = x; + float *yi = y; + int cd = len/4; + int cr = len%4; + int j; + + celt_assert(len>=3); + + /* Initialize sums to 0 */ + SUMM[0] = vdupq_n_f32(0); + SUMM[1] = vdupq_n_f32(0); + SUMM[2] = vdupq_n_f32(0); + SUMM[3] = vdupq_n_f32(0); + + YY[0] = vld1q_f32(yi); + + /* Each loop consumes 8 floats in y vector + * and 4 floats in x vector + */ + for (j = 0; j < cd; j++) { + yi += 4; + YY[4] = vld1q_f32(yi); + YY[1] = vextq_f32(YY[0], YY[4], 1); + YY[2] = vextq_f32(YY[0], YY[4], 2); + YY[3] = vextq_f32(YY[0], YY[4], 3); + + XX[0] = vld1q_dup_f32(xi++); + XX[1] = vld1q_dup_f32(xi++); + XX[2] = vld1q_dup_f32(xi++); + XX[3] = vld1q_dup_f32(xi++); + + SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]); + SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]); + SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]); + SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]); + YY[0] = YY[4]; + } + + /* Handle remaining values max iterations = 3 */ + for (j = 0; j < cr; j++) { + YY[0] = vld1q_f32(yi++); + XX_2 = vld1_lane_f32(xi++, XX_2, 0); + SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0); + } + + SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]); + SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]); + SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]); + + vst1q_f32(sum, SUMM[0]); +} + +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch, int arch) { + int i, j; + + celt_assert(max_pitch > 0); + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + + for (i = 0; i < (max_pitch-3); i += 4) { + xcorr_kernel_neon_float((float *)_x, (float *)_y+i, + (float *)xcorr+i, len); + } + + /* In case max_pitch isn't multiple of 4, do unrolled version */ + for (; i < max_pitch; i++) { + float sum = 0; + float *yi = _y+i; + for (j = 0; j < len; j++) + sum += _x[i]*yi[i]; + xcorr[i] = sum; + } +} diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index a07f8ac..f5adc48 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) # endif -# endif +#else /* Start !FIXED_POINT */ +/* Float case */ +#if defined(OPUS_ARM_NEON_INTR) +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch, int arch); +#endif + +#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR) +#define OVERRIDE_PITCH_XCORR (1) +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \ + len, max_pitch, arch)) +#endif +#endif /*end of !FIXED_POINT*/ #endif diff --git a/celt/pitch.h b/celt/pitch.h index 027ebd9..752b54e 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -46,7 +46,7 @@ #include "mips/pitch_mipsr1.h" #endif -#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#if defined(OPUS_ARM_ASM) # include "arm/pitch_arm.h" #endif @@ -189,11 +189,22 @@ opus_val32 void # endif (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, - const opus_val16 *, opus_val32 *, int, int); + const opus_val16 *, opus_val32 *, int, int +#if !defined(FIXED_POINT) + ,int +#endif +); +#if defined(FIXED_POINT) +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ + xcorr, len, max_pitch) +#else # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ - xcorr, len, max_pitch)) + xcorr, len, max_pitch, arch)) +#endif + # else # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch)) diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 3076bbf..cdb2939 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -56,7 +56,7 @@ #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#elif defined(OPUS_ARM_ASM) #include "arm/arm_celt_map.c" #endif diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 37ba74e..906fa7e 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -54,7 +54,7 @@ #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#elif defined(OPUS_ARM_ASM) #include "arm/arm_celt_map.c" #endif diff --git a/celt_sources.mk b/celt_sources.mk index 20b1b1b..3d4deca 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -30,5 +30,8 @@ celt/arm/arm_celt_map.c CELT_SOURCES_ARM_ASM = \ celt/arm/celt_pitch_xcorr_arm.s +CELT_SOURCES_ARM_NEON_INTR = \ +celt/arm/celt_neon_intr.c + CELT_AM_SOURCES_ARM_ASM = \ celt/arm/armopts.s.in diff --git a/configure.ac b/configure.ac index 9b2f51f..09657b6 100644 --- a/configure.ac +++ b/configure.ac @@ -198,12 +198,11 @@ cpu_arm=no AS_IF([test x"${enable_asm}" = x"yes"],[ inline_optimization="No ASM for your platform, please send patches" + OPUS_ARM_NEON_INTR_CPPFLAGS case $host_cpu in arm*) - dnl Currently we only have asm for fixed-point - AS_IF([test "$enable_float" != "yes"],[ cpu_arm=yes - AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization]) + AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm/intrinsic optimization]) AS_GCC_INLINE_ASSEMBLY( [inline_optimization="ARM"], [inline_optimization="disabled"] @@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1], [OPUS_ARM_INLINE_MEDIA=0]) AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0]) + + AC_ARG_ENABLE([arm-neon-intrinsics], + AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on ARM CPUs that support it])) + + AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"], + [ + AC_MSG_CHECKING(if compiler supports arm neon intrinsics) + save_CFLAGS="$CFLAGS" + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])], + [ + OPUS_ARM_NEON_INTR=1 + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3" + AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS) + ], + [ + OPUS_ARM_NEON_INTR=0 + ]) + CFLAGS="$save_CFLAGS" + AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])]) + ], + [ + OPUS_ARM_NEON_INTR=0 + AC_MSG_WARN([ARMv7 neon intrinsics not enabled]) + ]) + AS_IF([test x"$inline_optimization" = x"ARM"],[ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true]) AC_DEFINE([OPUS_ARM_INLINE_ASM], 1, @@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1], [Use ARMv5E inline asm optimizations]) inline_optimization="$inline_optimization (EDSP)" - ]) + ]n) AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[ AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1], [Use ARMv6 inline asm optimizations]) @@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ [*** ARM assembly requires perl -- disabling optimizations]) asm_optimization="(missing perl dependency for ARM)" ]) - ]) + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [ + AC_DEFINE([OPUS_ARM_NEON_INTR], 1, + [Compiler supports ARMv7 Neon Intrinsics]), + AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [ + AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1, + [Compiler support arm Intrinsics and target must support neon])], + []) + AS_IF([test x"enable_rtcd" != x""], + [rtcd_support="$rtcd_support (NEON_INTR)"], + []) + ],[]) ;; esac -],[ - inline_optimization="disabled" - asm_optimization="disabled" -]) +],[]) AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], @@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], [test x"${asm_optimization%% *}" = x"ARM"]) +AM_CONDITIONAL([OPUS_ARM_NEON_INTR], + [test x"$OPUS_ARM_NEON_INTR" = x"1"]) + AM_CONDITIONAL([HAVE_SSE4_1], [false]) AM_CONDITIONAL([HAVE_SSE2], [false]) AS_IF([test x"$enable_intrinsics" = x"yes"],[ diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index b632994..4c03a33 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -595,11 +595,11 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) #include "MacroCount.h" #include "MacroDebug.h" -#ifdef OPUS_ARM_INLINE_ASM +#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT) #include "arm/SigProc_FIX_armv4.h" #endif -#ifdef OPUS_ARM_INLINE_EDSP +#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT) #include "arm/SigProc_FIX_armv5e.h" #endif diff --git a/silk/macros.h b/silk/macros.h index 2f24950..217b851 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -138,11 +138,11 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32) (*((Matrix_base_adr) + ((row)+(M)*(column)))) #endif -#ifdef OPUS_ARM_INLINE_ASM +#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT) #include "arm/macros_armv4.h" #endif -#ifdef OPUS_ARM_INLINE_EDSP +#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT) #include "arm/macros_armv5e.h" #endif -- 1.7.9.5
Timothy B. Terriberry
2014-Nov-22 00:06 UTC
[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics
Viswanath Puttagunta wrote:> a. Simplest use case to validate this optimization for correctness. > b. Simplest use case to validate this optimization for performance. > > Would prefer something like opusdec that can be executed on command > line.The easiest thing to use is probably opus_demo (opusdec does a bunch of extra things, plus for interactive use we care about both the encoder and decoder, and celt_pitch_xcorr gets used vastly more by the encoder than the decoder... I think the decoder only uses it for PLC). Something like ./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw /dev/null comp48-stereo.sw can be found here: https://people.xiph.org/~tterribe/opus/comp48-stereo.sw celt_pitch_xcorr also gets used by the SILK encoder (more in fixed-point than float, but the float one uses it, too). So it may be worth doing a run with the application set to voip instead of restricted-lowdelay and a lower bitrate (e.g., 24000 instead of 96000). Even though this primarily affects the encoder, as a sanity check, it's always good to make sure the test vectors still decode correctly. Get them from <http://opus-codec.org/testvectors/opus_testvectors.tar.gz> and use tests/run_vectors.sh <build path> <test vectors path> 48000
Viswanath Puttagunta
2014-Nov-24 20:53 UTC
[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics
On 21 November 2014 at 18:06, Timothy B. Terriberry <tterribe at xiph.org> wrote:> > Viswanath Puttagunta wrote: >> >> a. Simplest use case to validate this optimization for correctness. >> b. Simplest use case to validate this optimization for performance. >> >> Would prefer something like opusdec that can be executed on command >> line. > > > The easiest thing to use is probably opus_demo (opusdec does a bunch of extra things, plus for interactive use we care about both the encoder and decoder, and celt_pitch_xcorr gets used vastly more by the encoder than the decoder... I think the decoder only uses it for PLC). > > Something like > ./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw /dev/null > > comp48-stereo.sw can be found here: https://people.xiph.org/~tterribe/opus/comp48-stereo.sw > > celt_pitch_xcorr also gets used by the SILK encoder (more in fixed-point than float, but the float one uses it, too). So it may be worth doing a run with the application set to voip instead of restricted-lowdelay and a lower bitrate (e.g., 24000 instead of 96000).Thanks for your feedback. I have verified both above cases. While I used ./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw out.wav, ./opus_demo voip 48000 2 24000 comp48-stereo.sw out.wav to make sure the output out.wav is clearly audible, I used below command (encode only) for performance benchmarking. ./opus_demo -e restricted-lowdelay 48000 2 96000 comp48-stereo.sw opus_raw.out ./opus_demo -e voip 48000 2 96000 comp48-stereo.sw opus_raw.out I saw much better improvement in performance (16.16%) for overall encode use case for "restricted-lowdelay 48000 2 96000" for CELT encoding as you suspected as celt_pitch_xcorr function gets used much more. I observed lesser improvement in performance (3.42%) for overall encode use case for "voip 48000 2 24000". This is somewhat expected as cel_pitch_xcorr_c was not the main contributor for performance in this SILK encoder use case. For detailed information on how I measured performance on my Beaglebone Black (Cortex-A8), please see "celt_pitch_xcorr (float) Neon Optimization" section of [1] [1]: https://docs.google.com/document/d/1L6csATjSsXtzg_sa1iHZta8hOsoVWA4UjHXEakpTrNk/edit?usp=sharing> > Even though this primarily affects the encoder, as a sanity check, it's always good to make sure the test vectors still decode correctly. Get them from <http://opus-codec.org/testvectors/opus_testvectors.tar.gz> and use > tests/run_vectors.sh <build path> <test vectors path> 48000
Timothy B. Terriberry
2014-Nov-28 21:52 UTC
[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Review comments inline.> +if OPUS_ARM_NEON_INTR > +noinst_LTLIBRARIES = libarmneon.la > +libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR) > +libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include > +endifI don't think these should be in a separate library. It brings with it lots of complications (to name one: wouldn't the .pc files need to be updated?). Please use the same mechanism that the SSE intrinsics use to add CFLAGS to the compilation of specific object files, e.g., $(SSE_OBJ): CFLAGS += -msse4.1> +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, > + const opus_val16 *, opus_val32 *, int , int, int) = { > + celt_pitch_xcorr_c, /* ARMv4 */ > + celt_pitch_xcorr_c, /* EDSP */ > + celt_pitch_xcorr_c, /* Media */ > +#if defined(OPUS_ARM_NEON_INTR) > + celt_pitch_xcorr_float_neon /* Neon */Please do not use tabs in source code (this applies here and everywhere below). Even with the tabs expanded in context, the comments here do not line up properly.> +static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int len) {x and y should be const.> + float32x4_t YY[5]; > + float32x4_t XX[4]; > + float32x2_t XX_2; > + float32x4_t SUMM[4]; > + float *xi = x; > + float *yi = y; > + int cd = len/4; > + int cr = len%4;len is signed, so / and % are NOT equivalent to the corresponding >> and & (they are much slower).> + int j; > + > + celt_assert(len>=3); > + > + /* Initialize sums to 0 */ > + SUMM[0] = vdupq_n_f32(0); > + SUMM[1] = vdupq_n_f32(0); > + SUMM[2] = vdupq_n_f32(0); > + SUMM[3] = vdupq_n_f32(0); > + > + YY[0] = vld1q_f32(yi); > + > + /* Each loop consumes 8 floats in y vector > + * and 4 floats in x vector > + */ > + for (j = 0; j < cd; j++) { > + yi += 4; > + YY[4] = vld1q_f32(yi);If len == 4, then in the first iteration you will have loaded 8 y values, but only 7 are guaranteed to be available (e.g., the C code only references y[0] up to y[len-1+3]). You need to end this loop early and fall back to another approach. See comments in celt_pitch_xcorr_arm.s for details and an example (there are other useful comments there that could shave another cycle or two from this inner loop).> + YY[1] = vextq_f32(YY[0], YY[4], 1); > + YY[2] = vextq_f32(YY[0], YY[4], 2); > + YY[3] = vextq_f32(YY[0], YY[4], 3); > + > + XX[0] = vld1q_dup_f32(xi++); > + XX[1] = vld1q_dup_f32(xi++); > + XX[2] = vld1q_dup_f32(xi++); > + XX[3] = vld1q_dup_f32(xi++);Don't do this. Do a single load and use vmlaq_lane_f32() to multiply by each value. That should cut at least 5 cycles out of this loop.> + > + SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]); > + SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]); > + SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]); > + SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]); > + YY[0] = YY[4]; > + } > + > + /* Handle remaining values max iterations = 3 */ > + for (j = 0; j < cr; j++) { > + YY[0] = vld1q_f32(yi++);This load is always redundant in the first iteration, which is a bit unfortunate.> + XX_2 = vld1_lane_f32(xi++, XX_2, 0);Don't load a single lane when you don't need the value(s) in the other lane(s). Use vld1_dup_f32() instead. It's faster and breaks dependencies.> + SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0); > + } > + > + SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]); > + SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]); > + SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]); > + > + vst1q_f32(sum, SUMM[0]); > +} > + > +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, > + opus_val32 *xcorr, int len, int max_pitch, int arch) {arch is unused. There's no reason to pass it here. If we're here, we know what the arch is.> + int i, j; > + > + celt_assert(max_pitch > 0); > + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); > + > + for (i = 0; i < (max_pitch-3); i += 4) { > + xcorr_kernel_neon_float((float *)_x, (float *)_y+i, > + (float *)xcorr+i, len); > + } > + > + /* In case max_pitch isn't multiple of 4, do unrolled version */ > + for (; i < max_pitch; i++) { > + float sum = 0; > + float *yi = _y+i; > + for (j = 0; j < len; j++) > + sum += _x[i]*yi[i]; > + xcorr[i] = sum; > + }This loop can still be largely vectorized. Any reason not to do so?> +} > diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h > index a07f8ac..f5adc48 100644 > --- a/celt/arm/pitch_arm.h > +++ b/celt/arm/pitch_arm.h > @@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, > ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) > # endif > > -# endif > +#else /* Start !FIXED_POINT */ > +/* Float case */ > +#if defined(OPUS_ARM_NEON_INTR) > +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, > + opus_val32 *xcorr, int len, int max_pitch, int arch); > +#endif > + > +#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR) > +#define OVERRIDE_PITCH_XCORR (1) > +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ > + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \ > + len, max_pitch, arch))Again, don't pass arch.> (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, > - const opus_val16 *, opus_val32 *, int, int); > + const opus_val16 *, opus_val32 *, int, int > +#if !defined(FIXED_POINT) > + ,int > +#endif > +);Which gets rid of this ugliness.> +#if defined(FIXED_POINT) > +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ > + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ > + xcorr, len, max_pitch) > +#else > # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ > ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ > - xcorr, len, max_pitch)) > + xcorr, len, max_pitch, arch)) > +#endif > +And this.> diff --git a/configure.ac b/configure.ac > index 9b2f51f..09657b6 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -198,12 +198,11 @@ cpu_arm=no > > AS_IF([test x"${enable_asm}" = x"yes"],[ > inline_optimization="No ASM for your platform, please send patches" > + OPUS_ARM_NEON_INTR_CPPFLAGS> case $host_cpu in > arm*) > - dnl Currently we only have asm for fixed-point > - AS_IF([test "$enable_float" != "yes"],[ > cpu_arm=yes > - AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization]) > + AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm/intrinsic optimization])Not sure I'm in love with conflating intrinsics with inline assembly. For example, are these tests (especially the PRESUME_NEON stuff) going to do the right thing on aarch64?> AS_GCC_INLINE_ASSEMBLY( > [inline_optimization="ARM"], > [inline_optimization="disabled"] > @@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ > AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1], > [OPUS_ARM_INLINE_MEDIA=0]) > AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0]) > + > + AC_ARG_ENABLE([arm-neon-intrinsics], > + AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on ARM CPUs that support it]))This should specify a default value for enable_arm_neon_intrinsics. However, I really think this switch should be unified with the --enable-intrinsics switch currently used by x86.> + > + AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"], > + [ > + AC_MSG_CHECKING(if compiler supports arm neon intrinsics) > + save_CFLAGS="$CFLAGS" > + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" > + AC_COMPILE_IFELSE( > + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])], > + [ > + OPUS_ARM_NEON_INTR=1 > + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3" > + AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS) > + ], > + [ > + OPUS_ARM_NEON_INTR=0 > + ]) > + CFLAGS="$save_CFLAGS" > + AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"], > + [AC_MSG_RESULT([yes])], > + [AC_MSG_RESULT([no])]) > + ], > + [ > + OPUS_ARM_NEON_INTR=0 > + AC_MSG_WARN([ARMv7 neon intrinsics not enabled]) > + ]) > + > AS_IF([test x"$inline_optimization" = x"ARM"],[ > AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true]) > AC_DEFINE([OPUS_ARM_INLINE_ASM], 1, > @@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ > AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1], > [Use ARMv5E inline asm optimizations]) > inline_optimization="$inline_optimization (EDSP)" > - ]) > + ]n)Buh?> AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[ > AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1], > [Use ARMv6 inline asm optimizations]) > @@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ > [*** ARM assembly requires perl -- disabling optimizations]) > asm_optimization="(missing perl dependency for ARM)" > ]) > - ]) > + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [ > + AC_DEFINE([OPUS_ARM_NEON_INTR], 1, > + [Compiler supports ARMv7 Neon Intrinsics]), > + AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [ > + AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1, > + [Compiler support arm Intrinsics and target must support neon])], > + []) > + AS_IF([test x"enable_rtcd" != x""], > + [rtcd_support="$rtcd_support (NEON_INTR)"], > + []) > + ],[]) > ;; > esac > -],[ > - inline_optimization="disabled" > - asm_optimization="disabled" > -]) > +],[]) > > AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) > AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], > @@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], > AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], > [test x"${asm_optimization%% *}" = x"ARM"]) > > +AM_CONDITIONAL([OPUS_ARM_NEON_INTR], > + [test x"$OPUS_ARM_NEON_INTR" = x"1"]) > + > AM_CONDITIONAL([HAVE_SSE4_1], [false]) > AM_CONDITIONAL([HAVE_SSE2], [false]) > AS_IF([test x"$enable_intrinsics" = x"yes"],[
Reasonably Related Threads
- [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [PATCH v1] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v3] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v2] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v2] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics