Viswanath Puttagunta
2014-Dec-07 08:17 UTC
[opus] [RFC PATCH v2] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Hi, Optimizes celt_pitch_xcorr for floating point. Changes from RFCv1: - Rebased on top of commit aad281878: Fix celt_pitch_xcorr_c signature. which got rid of ugly code around CELT_PITCH_XCORR_IMPL passing of "arch" parameter. - Unified with --enable-intrinsics used by x86 - Modified algorithm to be more in-line with algorithm in celt_pitch_xcorr_arm.s Viswanath Puttagunta (1): armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics Makefile.am | 11 ++ celt/arm/arm_celt_map.c | 15 ++- celt/arm/celt_neon_intr.c | 242 +++++++++++++++++++++++++++++++++++++++ celt/arm/pitch_arm.h | 13 ++- celt/cpu_support.h | 3 +- celt/pitch.h | 6 +- celt/tests/test_unit_mathops.c | 10 +- celt/tests/test_unit_rotation.c | 9 +- celt_sources.mk | 3 + configure.ac | 77 +++++++++++-- 10 files changed, 370 insertions(+), 19 deletions(-) create mode 100644 celt/arm/celt_neon_intr.c -- 1.7.9.5
Viswanath Puttagunta
2014-Dec-07 08:17 UTC
[opus] [RFC PATCH v2] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Optimize celt_pitch_xcorr function (for floating point) using ARM NEON intrinsics for SoCs that have NEON VFP unit. To enable this optimization, use --enable-intrinsics configure option. Compile time and runtime checks are also supported to make sure this optimization is only enabled when the compiler supports neon intrinsics. --- Makefile.am | 11 ++ celt/arm/arm_celt_map.c | 15 ++- celt/arm/celt_neon_intr.c | 242 +++++++++++++++++++++++++++++++++++++++ celt/arm/pitch_arm.h | 13 ++- celt/cpu_support.h | 3 +- celt/pitch.h | 6 +- celt/tests/test_unit_mathops.c | 10 +- celt/tests/test_unit_rotation.c | 9 +- celt_sources.mk | 3 + configure.ac | 77 +++++++++++-- 10 files changed, 370 insertions(+), 19 deletions(-) create mode 100644 celt/arm/celt_neon_intr.c diff --git a/Makefile.am b/Makefile.am index e20f7b4..0687814 100644 --- a/Makefile.am +++ b/Makefile.am @@ -41,6 +41,11 @@ endif if CPU_ARM CELT_SOURCES += $(CELT_SOURCES_ARM) SILK_SOURCES += $(SILK_SOURCES_ARM) + +if OPUS_ARM_NEON_INTR +CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) +endif + if OPUS_ARM_EXTERNAL_ASM nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \ @@ -260,3 +265,9 @@ if HAVE_SSE2 $(SSE_OBJ): CFLAGS += -msse2 endif endif + +if OPUS_ARM_NEON_INTR +CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ + %test_unit_rotation.o %test_unit_mathops.o +$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS) +endif diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 547a84d..ecdf7ec 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -41,9 +41,18 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ }; -# else -# error "Floating-point implementation is not supported by ARM asm yet." \ - "Reconfigure with --disable-rtcd or send patches." +# else /* !FIXED_POINT */ +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int, int) = { + celt_pitch_xcorr_c, /* ARMv4 */ + celt_pitch_xcorr_c, /* EDSP */ + celt_pitch_xcorr_c, /* Media */ +#if defined(OPUS_ARM_NEON_INTR) + celt_pitch_xcorr_float_neon /* Neon */ +#else + celt_pitch_xcorr_c /* Neon */ +#endif +}; # endif #endif diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c new file mode 100644 index 0000000..af71498 --- /dev/null +++ b/celt/arm/celt_neon_intr.c @@ -0,0 +1,242 @@ +/* Copyright (c) 2014-2015 Xiph.Org Foundation + Written by Viswanath Puttagunta */ +/** + @file celt_neon_intr.c + @brief ARM Neon Intrinsic optimizations for celt + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <arm_neon.h> +#include "../arch.h" + +/* + * Function: xcorr_kernel_neon_float + * --------------------------------- + * Computes 4 correlation values and stores them in sum[4] + */ +void xcorr_kernel_neon_float(const float *x, const float *y, + float sum[4], int len) { + float32x4_t YY[3]; + float32x4_t YEXT[3]; + float32x4_t XX[2]; + float32x2_t XX_2; + float32x4_t SUMM; + float *xi = x; + float *yi = y; + + celt_assert(len>0); + + YY[0] = vld1q_f32(yi); + SUMM = vdupq_n_f32(0); + + /* Consume 8 elements in x vector and 12 elements in y + * vector. However, the 12'th element never really gets + * touched in this loop. So, if len == 8, then we only + * must access y[0] to y[10]. y[11] must not be accessed + * hence make sure len > 8 and not len >= 8 + */ + while (len > 8) { + yi += 4; + YY[1] = vld1q_f32(yi); + yi += 4; + YY[2] = vld1q_f32(yi); + + XX[0] = vld1q_f32(xi); + xi += 4; + XX[1] = vld1q_f32(xi); + xi += 4; + + SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); + YEXT[0] = vextq_f32(YY[0], YY[1], 1); + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); + YEXT[1] = vextq_f32(YY[0], YY[1], 2); + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); + YEXT[2] = vextq_f32(YY[0], YY[1], 3); + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); + + SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0); + YEXT[0] = vextq_f32(YY[1], YY[2], 1); + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1); + YEXT[1] = vextq_f32(YY[1], YY[2], 2); + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0); + YEXT[2] = vextq_f32(YY[1], YY[2], 3); + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1); + + YY[0] = YY[2]; + len -= 8; + } + + /* Consume 4 elements in x vector and 8 elements in y + * vector. However, the 8'th element in y never really gets + * touched in this loop. So, if len == 4, then we only + * must access y[0] to y[6]. y[7] must not be accessed + * hence make sure len>4 and not len>=4 + */ + while (len > 4) { + yi += 4; + YY[1] = vld1q_f32(yi); + + XX[0] = vld1q_f32(xi); + xi += 4; + + SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); + YEXT[0] = vextq_f32(YY[0], YY[1], 1); + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); + YEXT[1] = vextq_f32(YY[0], YY[1], 2); + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); + YEXT[2] = vextq_f32(YY[0], YY[1], 3); + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); + + YY[0] = YY[1]; + len -= 4; + } + + /* Just unroll the rest of the loop */ + yi++; + switch(len) { + case 4: + XX_2 = vld1_dup_f32(xi++); + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); + YY[0] = vld1q_f32(yi++); + case 3: + XX_2 = vld1_dup_f32(xi++); + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); + YY[0] = vld1q_f32(yi++); + case 2: + XX_2 = vld1_dup_f32(xi++); + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); + YY[0] = vld1q_f32(yi++); + case 1: + XX_2 = vld1_dup_f32(xi++); + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); + } + + vst1q_f32(sum, SUMM); +} + +/* + * Function: xcorr3to1_kernel_neon_float + * --------------------------------- + * Computes single correlation values and stores in *sum + */ +void xcorr3to1_kernel_neon_float(const float *x, const float *y, + float *sum, int len) { + int i; + float32x4_t XX[4]; + float32x4_t YY[4]; + float32x4_t SUMM; + float32x2_t ZERO; + float32x2x2_t tv; + float sumi; + float *xi = x; + float *yi = y; + + ZERO = vdup_n_f32(0); + SUMM = vdupq_n_f32(0); + + /* Work on 16 values per cycle */ + while (len >= 16) { + XX[0] = vld1q_f32(xi); + xi += 4; + XX[1] = vld1q_f32(xi); + xi += 4; + XX[2] = vld1q_f32(xi); + xi += 4; + XX[3] = vld1q_f32(xi); + xi += 4; + + YY[0] = vld1q_f32(yi); + yi += 4; + YY[1] = vld1q_f32(yi); + yi += 4; + YY[2] = vld1q_f32(yi); + yi += 4; + YY[3] = vld1q_f32(yi); + yi += 4; + + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); + SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); + SUMM = vmlaq_f32(SUMM, YY[2], XX[2]); + SUMM = vmlaq_f32(SUMM, YY[3], XX[3]); + len -= 16; + } + + /* Work on 8 values per cycle */ + while (len >= 8) { + XX[0] = vld1q_f32(xi); + xi += 4; + XX[1] = vld1q_f32(xi); + xi += 4; + + YY[0] = vld1q_f32(yi); + yi += 4; + YY[1] = vld1q_f32(yi); + yi += 4; + + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); + SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); + len -= 8; + } + + /* Work on 4 values per cycle */ + while (len >= 4) { + XX[0] = vld1q_f32(xi); + xi += 4; + YY[0] = vld1q_f32(yi); + yi += 4; + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); + len -= 4; + } + /* Accumulate results into single float */ + tv.val[0] = vadd_f32(vget_low_f32(SUMM), vget_high_f32(SUMM)); + tv = vtrn_f32(tv.val[0], ZERO); + tv.val[0] = vadd_f32(tv.val[0], tv.val[1]); + + vst1_lane_f32(&sumi, tv.val[0], 0); + for (i = 0; i < len; i++) + sumi += xi[i] * yi[i]; + *sum = sumi; +} + +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch) { + int i; + celt_assert(max_pitch > 0); + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + + for (i = 0; i < (max_pitch-3); i += 4) { + xcorr_kernel_neon_float((float *)_x, (float *)_y+i, + (float *)xcorr+i, len); + } + + /* In case max_pitch isn't multiple of 4 + * compute single correlation value per cycle + */ + for (; i < max_pitch; i++) { + xcorr3to1_kernel_neon_float((float *)_x, (float *)_y+i, + (float *)xcorr+i, len); + } +} diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index a07f8ac..125d1bc 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -52,6 +52,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) # endif -# endif +#else /* Start !FIXED_POINT */ +/* Float case */ +#if defined(OPUS_ARM_NEON_INTR) +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +#if !defined(OPUS_HAVE_RTCD) +#define OVERRIDE_PITCH_XCORR (1) +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch)) +#endif +#endif +#endif /* end !FIXED_POINT */ #endif diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 71efff1..1d62e2f 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -31,7 +31,8 @@ #include "opus_types.h" #include "opus_defines.h" -#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) +#if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) #include "arm/armcpu.h" /* We currently support 4 ARM variants: diff --git a/celt/pitch.h b/celt/pitch.h index 5c6e551..4368cc5 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -46,7 +46,8 @@ #include "mips/pitch_mipsr1.h" #endif -#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_NEON_INTR)) # include "arm/pitch_arm.h" #endif @@ -178,7 +179,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, #if !defined(OVERRIDE_PITCH_XCORR) /*Is run-time CPU detection enabled on this platform?*/ -# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) +# if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) extern # if defined(FIXED_POINT) opus_val32 diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 3076bbf..2db87be 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -56,10 +56,18 @@ #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#endif + +#if defined(OPUS_ARM_NEON_INTR) +#include "arm/celt_neon_intr.c" +#endif + +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_NEON_INTR)) #include "arm/arm_celt_map.c" #endif + #ifdef FIXED_POINT #define WORD "%d" #else diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 37ba74e..bff8469 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -54,7 +54,14 @@ #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +#endif + +#if defined(OPUS_ARM_NEON_INTR) +#include "arm/celt_neon_intr.c" +#endif + +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_NEON_INTR)) #include "arm/arm_celt_map.c" #endif diff --git a/celt_sources.mk b/celt_sources.mk index 20b1b1b..29ec937 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s CELT_AM_SOURCES_ARM_ASM = \ celt/arm/armopts.s.in + +CELT_SOURCES_ARM_NEON_INTR = \ +celt/arm/celt_neon_intr.c diff --git a/configure.ac b/configure.ac index 9b2f51f..6ad4a70 100644 --- a/configure.ac +++ b/configure.ac @@ -190,14 +190,14 @@ AC_ARG_ENABLE([rtcd], [enable_rtcd=yes]) AC_ARG_ENABLE([intrinsics], - [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],, + [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations for ARM(float) X86(fixed)])],, [enable_intrinsics=no]) rtcd_support=no cpu_arm=no AS_IF([test x"${enable_asm}" = x"yes"],[ - inline_optimization="No ASM for your platform, please send patches" + inline_optimization="No in-line ASM for your platform, please send patches" case $host_cpu in arm*) dnl Currently we only have asm for fixed-point @@ -343,7 +343,6 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ asm_optimization="disabled" ]) -AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], [test x"${inline_optimization%% *}" = x"ARM"]) AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], @@ -351,9 +350,52 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], AM_CONDITIONAL([HAVE_SSE4_1], [false]) AM_CONDITIONAL([HAVE_SSE2], [false]) + AS_IF([test x"$enable_intrinsics" = x"yes"],[ -AS_IF([test x"$enable_float" = x"no"], -[AS_IF([test x"$host_cpu" = x"i386" -o x"$host_cpu" = x"i686" -o x"$host_cpu" = x"x86_64"],[ + case $host_cpu in + arm*) + cpu_arm=yes + AC_MSG_CHECKING(if compiler supports arm neon intrinsics) + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])], + [ + OPUS_ARM_NEON_INTR=1 + AC_MSG_RESULT([yes]) + ],[ + OPUS_ARM_NEON_INR=0 + AC_MSG_RESULT([no]) + ]) + CFLAGS="$save_CFLAGS" + #Now we know if compiler supports ARM neon intrinsics or not + + #Currently we only have intrinsic optimization for floating point + AS_IF([test x"$enable_float" = x"yes"], + [ + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], + [ + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3" + AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS) + AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics]) + AS_IF([test x"enable_rtcd" != x""], + [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[]) + enable_intrinsics="$enable_intrinsics ARMv7_Neon_Intrinsics" + dnl Don't see why defining these is necessary to check features at runtime + AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions]) + AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions]) + AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions]) + ], + [ + AC_MSG_WARN([Compiler does not support ARM intrinsics]) + enable_intrinsics=no + ]) + ], [ + AC_MSG_WARN([Currently on have ARM intrinsics for float]) + enable_intrinsics=no + ]) + ;; + "i386" | "i686" | "x86_64") + AS_IF([test x"$enable_float" = x"no"],[ AS_IF([test x"$enable_rtcd" = x"yes"],[ get_cpuid_by_asm="no" AC_MSG_CHECKING([Get CPU Info]) @@ -423,7 +465,7 @@ AS_IF([test x"$enable_float" = x"no"], AM_CONDITIONAL([HAVE_SSE2], [true]) AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])]) - ],[ + ],[ ##### Else case for AS_IF([test x"$?" = x"0"]) gcc -Q --help=target | grep "\-msse2 " AC_MSG_CHECKING([sse2]) AS_IF([test x"$?" = x"0"],[ @@ -446,13 +488,28 @@ AS_IF([test x"$enable_float" = x"no"], AM_CONDITIONAL([HAVE_SSE2], [true]) AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])]) - ],[enable_intrinsics="no"]) + ],[enable_intrinsics="no"]) #End of AS_IF([test x"$?" = x"0"] ]) - ], [enable_intrinsics="no"]) -]) -], [enable_intrinsics="no"]) + ], [ + enable_intrinsics="no" + ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"] +], +[ ## Else case for AS_IF([test x"$enable_float" = x"no"] + AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for fixed point]) + enable_intrinsics="no" +]) ## End of AS_IF([test x"$enable_float" = x"no"] + ;; + *) + AC_MSG_WARN([No intrinsics support for your architecture]) + enable_intrinsics="no" + ;; + esac ]) +AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) +AM_CONDITIONAL([OPUS_ARM_NEON_INTR], + [test x"$OPUS_ARM_NEON_INTR" = x"1"]) + AS_IF([test x"$enable_rtcd" = x"yes"],[ AS_IF([test x"$rtcd_support" != x"no"],[ AC_DEFINE([OPUS_HAVE_RTCD], [1], -- 1.7.9.5
Timothy B. Terriberry
2014-Dec-09 03:12 UTC
[opus] [RFC PATCH v2] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Viswanath Puttagunta wrote:> + SUMM = vdupq_n_f32(0);It kills me that there's no intrinsic for VMOV.F32 d0, #0 (or at least I couldn't find one), so this takes two instructions instead of one.> + /* Consume 4 elements in x vector and 8 elements in y > + * vector. However, the 8'th element in y never really gets > + * touched in this loop. So, if len == 4, then we only > + * must access y[0] to y[6]. y[7] must not be accessed > + * hence make sure len>4 and not len>=4 > + */ > + while (len > 4) {This can be an if() instead of a while (though it looks like gcc is smart enough to figure that out).> + /* Just unroll the rest of the loop */If you're not going to special case the last 2+1+1 samples, is there a measurable performance difference compared to simply looping?> + yi++; > + switch(len) { > + case 4: > + XX_2 = vld1_dup_f32(xi++); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + YY[0] = vld1q_f32(yi++); > + case 3: > + XX_2 = vld1_dup_f32(xi++); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + YY[0] = vld1q_f32(yi++); > + case 2: > + XX_2 = vld1_dup_f32(xi++); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + YY[0] = vld1q_f32(yi++); > + case 1: > + XX_2 = vld1_dup_f32(xi++); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + } > + > + vst1q_f32(sum, SUMM); > +} > + > +/* > + * Function: xcorr3to1_kernel_neon_float > + * --------------------------------- > + * Computes single correlation values and stores in *sum > + */ > +void xcorr3to1_kernel_neon_float(const float *x, const float *y, > + float *sum, int len) {I had to think quite a bit about what "3to1" meant (since it is describing the context of the caller, not what the actual function does). I'd follow the naming convention in the existing celt_pitch_xcorr_arm.s, and use "process1", personally.> + int i; > + float32x4_t XX[4]; > + float32x4_t YY[4]; > + float32x4_t SUMM; > + float32x2_t ZERO; > + float32x2x2_t tv; > + float sumi; > + float *xi = x; > + float *yi = y; > + > + ZERO = vdup_n_f32(0); > + SUMM = vdupq_n_f32(0); > + > + /* Work on 16 values per cycle */s/cycle/iteration/ (here and below). In performance-critical code when you say cycle I think machine cycle, and NEON definitely can't process 16 floats in one of those.> + while (len >= 16) {> + /* Accumulate results into single float */ > + tv.val[0] = vadd_f32(vget_low_f32(SUMM), vget_high_f32(SUMM)); > + tv = vtrn_f32(tv.val[0], ZERO); > + tv.val[0] = vadd_f32(tv.val[0], tv.val[1]); > + > + vst1_lane_f32(&sumi, tv.val[0], 0);Accessing tv.val[0] and tv.val[1] directly seems to send these values through the stack, e.g., f4: f3ba7085 vtrn.32 d7, d5 f8: ed0b7b0f vstr d7, [fp, #-60] fc: ed0b5b0d vstr d5, [fp, #-52] ... 114: ed1b6b09 vldr d6, [fp, #-36] 118: ed1b7b0b vldr d7, [fp, #-44] 11c: f2077d06 vadd.f32 d7, d7, d6 120: f483780f vst1.32 {d7[0]}, [r3] Can't you just use float32x2_t tv; tv = vadd_f32(vget_low_f32(SUMM), vget_high_f32(SUMM)); tv = vpadd_f32(tv, tv); (you can get rid of ZERO, then, too)> + for (i = 0; i < len; i++) > + sumi += xi[i] * yi[i]; > + *sum = sumi;This bounces things through the stack into vfp registers, which is a huge stall. I'd continue to use NEON here with vld1_dup_f32()/vmla_f32()/etc.> +}> +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ > + || defined(OPUS_ARM_NEON_INTR)) > #include "arm/arm_celt_map.c" > #endif > > +Unrelated whitespace change.> - inline_optimization="No ASM for your platform, please send patches" > + inline_optimization="No in-line ASM for your platform, please send patches""inline" is one word.> + AC_MSG_CHECKING(if compiler supports arm neon intrinsics)Capitalize ARM and NEON, please.> + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" > + AC_COMPILE_IFELSE(Can we use AC_LINK_IFELSE? We had a problem where sometimes if SSE/AVX was not available, but the headers existed, the #include would succeed but no functions would get defined. My arm_neon.h seems to be written better than that, but I'd like to guard against other implementations having similar problems (and keep things consistent with the SSE tests).> + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],You also need to include a call to an actual NEON intrinsic here. If the function is not defined, even a call to it here will compile (with an implicit declaration warning), but linking will fail.> + [ > + OPUS_ARM_NEON_INTR=1 > + AC_MSG_RESULT([yes]) > + ],[ > + OPUS_ARM_NEON_INR=0OPUS_ARM_NEON_INTR (you're missing a 'T')> + AC_MSG_RESULT([no]) > + ]) > + CFLAGS="$save_CFLAGS" > + #Now we know if compiler supports ARM neon intrinsics or not > + > + #Currently we only have intrinsic optimization for floating point > + AS_IF([test x"$enable_float" = x"yes"], > + [ > + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], > + [ > + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"I don't think you should change the optimization level here.
Possibly Parallel Threads
- [RFC PATCH v2] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [PATCH v1] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v3] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v2] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics