thr3ads.net - opus - [opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics [Nov 2014]

If this information is useful, please help other people find it:
Share via:

Viswanath Puttagunta

2014-Nov-21 23:38 UTC

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

Hello,

I received feedback from engineers working on NE10 [1] that
it would be better to use NE10 [1] for FFT optimizations for
opus use cases. However, these FFT patches are currently in review
and haven't been integrated into NE10 yet.

While the FFT functions in NE10 are getting baked, I wanted
to optimize the celt_pitch_xcorr (floating point only) and use
it to introduce ARM NEON intrinsics into libopus project.

This will also lay the foundation when FFT routines in NE10
become available.

I would like to know the following as I DID NOT fully use-case
test this celt_pitch_xcorr optimization. Did only unit tests.

a. Simplest use case to validate this optimization for correctness.
b. Simplest use case to validate this optimization for performance.

Would prefer something like opusdec that can be executed on command
line.

Any other feedback welcome.

[1]: http://projectne10.github.io/Ne10/

Viswanath Puttagunta (1):
  armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

 Makefile.am                     |   23 +++++++++++
 celt/_kiss_fft_guts.h           |    4 +-
 celt/arch.h                     |    4 +-
 celt/arm/arm_celt_map.c         |   15 +++++++-
 celt/arm/celt_neon_intr.c       |   81 +++++++++++++++++++++++++++++++++++++++
 celt/arm/pitch_arm.h            |   15 +++++++-
 celt/pitch.h                    |   17 ++++++--
 celt/tests/test_unit_mathops.c  |    2 +-
 celt/tests/test_unit_rotation.c |    2 +-
 celt_sources.mk                 |    3 ++
 configure.ac                    |   56 ++++++++++++++++++++++-----
 silk/SigProc_FIX.h              |    4 +-
 silk/macros.h                   |    4 +-
 13 files changed, 205 insertions(+), 25 deletions(-)
 create mode 100644 celt/arm/celt_neon_intr.c

-- 
1.7.9.5

Viswanath Puttagunta

2014-Nov-21 23:38 UTC

head link

[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

Optimize celt_pitch_xcorr function (for floating point)
using ARM NEON intrinsics for SoCs that have NEON VFP unit.

As initial step, targeting ARMv7 NEON (VFP3+) based SoCs.
To enable this optimization, use --enable-arm-neon-intrinsics
configure option. This flag is not enabled by default.

Compile time and runtime checks are also supported to make sure
this optimization is only enabled when the compiler supports
neon intrinsics.
---
 Makefile.am                     |   23 +++++++++++
 celt/_kiss_fft_guts.h           |    4 +-
 celt/arch.h                     |    4 +-
 celt/arm/arm_celt_map.c         |   15 +++++++-
 celt/arm/celt_neon_intr.c       |   81 +++++++++++++++++++++++++++++++++++++++
 celt/arm/pitch_arm.h            |   15 +++++++-
 celt/pitch.h                    |   17 ++++++--
 celt/tests/test_unit_mathops.c  |    2 +-
 celt/tests/test_unit_rotation.c |    2 +-
 celt_sources.mk                 |    3 ++
 configure.ac                    |   56 ++++++++++++++++++++++-----
 silk/SigProc_FIX.h              |    4 +-
 silk/macros.h                   |    4 +-
 13 files changed, 205 insertions(+), 25 deletions(-)
 create mode 100644 celt/arm/celt_neon_intr.c

diff --git a/Makefile.am b/Makefile.am
index e20f7b4..0e9e120 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,6 +38,12 @@ CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
 endif
 
+if OPUS_ARM_NEON_INTR
+noinst_LTLIBRARIES = libarmneon.la
+libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR)
+libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include
+endif
+
 if CPU_ARM
 CELT_SOURCES += $(CELT_SOURCES_ARM)
 SILK_SOURCES += $(SILK_SOURCES_ARM)
@@ -59,6 +65,9 @@ include opus_headers.mk
 libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
 libopus_la_LDFLAGS = -no-undefined -version-info
@OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
 libopus_la_LIBADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+libopus_la_LIBADD += ./libarmneon.la
+endif
 
 pkginclude_HEADERS = include/opus.h include/opus_multistream.h
include/opus_types.h include/opus_defines.h
 
@@ -97,6 +106,11 @@ celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
 celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
 celt_tests_test_unit_dft_LDADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_dft_LDADD += ./libarmneon.la
+endif
+
+
 
 celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
 celt_tests_test_unit_entropy_LDADD = $(LIBM)
@@ -110,10 +124,16 @@ if CPU_ARM
 if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_mathops_LDADD += libopus.la
 endif
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_mathops_LDADD += ./libarmneon.la
+endif
 endif
 
 celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
 celt_tests_test_unit_mdct_LDADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_mdct_LDADD += ./libarmneon.la
+endif
 
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
 celt_tests_test_unit_rotation_LDADD = $(LIBM)
@@ -121,6 +141,9 @@ if CPU_ARM
 if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_rotation_LDADD += libopus.la
 endif
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_rotation_LDADD += ./libarmneon.la
+endif
 endif
 
 celt_tests_test_unit_types_SOURCES = celt/tests/test_unit_types.c
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 5e3d58f..11a2676 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -90,11 +90,11 @@
     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
-#if defined(OPUS_ARM_INLINE_ASM)
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/kiss_fft_armv4.h"
 #endif
 
-#if defined(OPUS_ARM_INLINE_EDSP)
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/kiss_fft_armv5e.h"
 #endif
 #if defined(MIPSr1_ASM)
diff --git a/celt/arch.h b/celt/arch.h
index 9f74ddd..0e3061c 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -118,9 +118,9 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
 
 #include "fixed_generic.h"
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/fixed_armv5e.h"
-#elif defined (OPUS_ARM_INLINE_ASM)
+#elif defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/fixed_armv4.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 547a84d..a6313e7 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -42,8 +42,19 @@ opus_val32 (*const
CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
 };
 # else
-#  error "Floating-point implementation is not supported by ARM asm
yet." \
- "Reconfigure with --disable-rtcd or send patches."
+
+void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int , int, int) = {
+  celt_pitch_xcorr_c,               	/* ARMv4 */
+  celt_pitch_xcorr_c,  		 	/* EDSP */
+  celt_pitch_xcorr_c,			/* Media */
+#if defined(OPUS_ARM_NEON_INTR)
+  celt_pitch_xcorr_float_neon		 /* Neon */
+#else
+  celt_pitch_xcorr_c
+#endif
+};
+
 # endif
 
 #endif
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
new file mode 100644
index 0000000..88954fb
--- /dev/null
+++ b/celt/arm/celt_neon_intr.c
@@ -0,0 +1,81 @@
+#include <arm_neon.h>
+#include "../arch.h"
+
+static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int len)
{
+	float32x4_t YY[5];
+	float32x4_t XX[4];
+	float32x2_t XX_2;
+	float32x4_t SUMM[4];
+	float *xi = x;
+	float *yi = y;
+	int cd = len/4;
+	int cr = len%4;
+	int j;
+
+	celt_assert(len>=3);
+
+	/* Initialize sums to 0 */
+	SUMM[0] = vdupq_n_f32(0);
+	SUMM[1] = vdupq_n_f32(0);
+	SUMM[2] = vdupq_n_f32(0);
+	SUMM[3] = vdupq_n_f32(0);
+
+	YY[0] = vld1q_f32(yi);
+
+	/* Each loop consumes 8 floats in y vector
+	 * and 4 floats in x vector
+	 */
+	for (j = 0; j < cd; j++) {
+		yi += 4;
+		YY[4] = vld1q_f32(yi);
+		YY[1] = vextq_f32(YY[0], YY[4], 1);
+		YY[2] = vextq_f32(YY[0], YY[4], 2);
+		YY[3] = vextq_f32(YY[0], YY[4], 3);
+
+		XX[0] = vld1q_dup_f32(xi++);
+		XX[1] = vld1q_dup_f32(xi++);
+		XX[2] = vld1q_dup_f32(xi++);
+		XX[3] = vld1q_dup_f32(xi++);
+
+		SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]);
+		SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]);
+		SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]);
+		SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]);
+		YY[0] = YY[4];
+	}
+
+	/* Handle remaining values max iterations = 3 */
+	for (j = 0; j < cr; j++) {
+		YY[0] = vld1q_f32(yi++);
+		XX_2 = vld1_lane_f32(xi++, XX_2, 0);
+		SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0);
+	}
+
+	SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]);
+	SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]);
+	SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]);
+
+	vst1q_f32(sum, SUMM[0]);
+}
+
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+			opus_val32 *xcorr, int len, int max_pitch, int arch) {
+	int i, j;
+
+	celt_assert(max_pitch > 0);
+	celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+
+	for (i = 0; i < (max_pitch-3); i += 4) {
+		xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
+					(float *)xcorr+i, len);
+	}
+
+	/* In case max_pitch isn't multiple of 4, do unrolled version */
+	for (; i < max_pitch; i++) {
+		float sum = 0;
+		float *yi = _y+i;
+		for (j = 0; j < len; j++)
+			sum += _x[i]*yi[i];
+		xcorr[i] = sum;
+	}
+}
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index a07f8ac..f5adc48 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const
opus_val16 *_y,
   ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
 #  endif
 
-# endif
+#else /* Start !FIXED_POINT */
+/* Float case */
+#if defined(OPUS_ARM_NEON_INTR)
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+				opus_val32 *xcorr, int len, int max_pitch, int arch);
+#endif
+
+#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR)
+#define OVERRIDE_PITCH_XCORR (1)
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+	((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \
+						len, max_pitch, arch))
+#endif
+#endif	/*end of !FIXED_POINT*/
 
 #endif
diff --git a/celt/pitch.h b/celt/pitch.h
index 027ebd9..752b54e 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,7 +46,7 @@
 #include "mips/pitch_mipsr1.h"
 #endif
 
-#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#if defined(OPUS_ARM_ASM)
 # include "arm/pitch_arm.h"
 #endif
 
@@ -189,11 +189,22 @@ opus_val32
 void
 #  endif
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int
+#if !defined(FIXED_POINT)
+	,int
+#endif
+);
 
+#if defined(FIXED_POINT)
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch)
+#else
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
+#endif
+
 # else
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 3076bbf..cdb2939 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -56,7 +56,7 @@
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif defined(OPUS_ARM_ASM)
 #include "arm/arm_celt_map.c"
 #endif
 
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 37ba74e..906fa7e 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -54,7 +54,7 @@
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif defined(OPUS_ARM_ASM)
 #include "arm/arm_celt_map.c"
 #endif
 
diff --git a/celt_sources.mk b/celt_sources.mk
index 20b1b1b..3d4deca 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -30,5 +30,8 @@ celt/arm/arm_celt_map.c
 CELT_SOURCES_ARM_ASM = \
 celt/arm/celt_pitch_xcorr_arm.s
 
+CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_neon_intr.c
+
 CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
diff --git a/configure.ac b/configure.ac
index 9b2f51f..09657b6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -198,12 +198,11 @@ cpu_arm=no
 
 AS_IF([test x"${enable_asm}" = x"yes"],[
     inline_optimization="No ASM for your platform, please send
patches"
+    OPUS_ARM_NEON_INTR_CPPFLAGS     case $host_cpu in
       arm*)
-        dnl Currently we only have asm for fixed-point
-        AS_IF([test "$enable_float" != "yes"],[
             cpu_arm=yes
-            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm optimization])
+            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm/intrinsic
optimization])
             AS_GCC_INLINE_ASSEMBLY(
                 [inline_optimization="ARM"],
                 [inline_optimization="disabled"]
@@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" =
x"yes"],[
             AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
                 [OPUS_ARM_INLINE_MEDIA=0])
             AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
+
+	    AC_ARG_ENABLE([arm-neon-intrinsics],
+		AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on
ARM CPUs that support it]))
+
+	    AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"],
+		[
+	        AC_MSG_CHECKING(if compiler supports arm neon intrinsics)
+		save_CFLAGS="$CFLAGS"
+		save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
+		AC_COMPILE_IFELSE(
+		    [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],
+		    [
+			OPUS_ARM_NEON_INTR=1
+			OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"
+			AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS)
+		    ],
+		    [
+			OPUS_ARM_NEON_INTR=0
+		    ])
+		CFLAGS="$save_CFLAGS"
+		AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"],
+		    [AC_MSG_RESULT([yes])],
+		    [AC_MSG_RESULT([no])])
+		],
+		[
+		    OPUS_ARM_NEON_INTR=0
+		    AC_MSG_WARN([ARMv7 neon intrinsics not enabled])
+		])
+
             AS_IF([test x"$inline_optimization" = x"ARM"],[
                 AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
                 AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
@@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
                     AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
                         [Use ARMv5E inline asm optimizations])
                     inline_optimization="$inline_optimization (EDSP)"
-                ])
+                ]n)
                 AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" =
x"1"],[
                     AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
                         [Use ARMv6 inline asm optimizations])
@@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" =
x"yes"],[
                   [*** ARM assembly requires perl -- disabling optimizations])
                 asm_optimization="(missing perl dependency for ARM)"
             ])
-        ])
+	   AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [
+		AC_DEFINE([OPUS_ARM_NEON_INTR], 1,
+			[Compiler supports ARMv7 Neon Intrinsics]),
+		AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
+			AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1,
+			    [Compiler support arm Intrinsics and target must support neon])],
+			[])
+		AS_IF([test x"enable_rtcd" != x""],
+			[rtcd_support="$rtcd_support (NEON_INTR)"],
+			[])
+		],[])
         ;;
     esac
-],[
-   inline_optimization="disabled"
-   asm_optimization="disabled"
-])
+],[])
 
 AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
 AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
@@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
 AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
     [test x"${asm_optimization%% *}" = x"ARM"])
 
+AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
+    [test x"$OPUS_ARM_NEON_INTR" = x"1"])
+
 AM_CONDITIONAL([HAVE_SSE4_1], [false])
 AM_CONDITIONAL([HAVE_SSE2], [false])
 AS_IF([test x"$enable_intrinsics" = x"yes"],[
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..4c03a33 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -595,11 +595,11 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a,
opus_int64 b)
 #include "MacroCount.h"
 #include "MacroDebug.h"
 
-#ifdef OPUS_ARM_INLINE_ASM
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/SigProc_FIX_armv4.h"
 #endif
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/SigProc_FIX_armv5e.h"
 #endif
 
diff --git a/silk/macros.h b/silk/macros.h
index 2f24950..217b851 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -138,11 +138,11 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32)
     (*((Matrix_base_adr) + ((row)+(M)*(column))))
 #endif
 
-#ifdef OPUS_ARM_INLINE_ASM
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/macros_armv4.h"
 #endif
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/macros_armv5e.h"
 #endif
 
-- 
1.7.9.5

Timothy B. Terriberry

2014-Nov-22 00:06 UTC

head link

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

Viswanath Puttagunta wrote:> a. Simplest use case to validate this optimization for correctness.
> b. Simplest use case to validate this optimization for performance.
>
> Would prefer something like opusdec that can be executed on command
> line.
The easiest thing to use is probably opus_demo (opusdec does a bunch of 
extra things, plus for interactive use we care about both the encoder 
and decoder, and celt_pitch_xcorr gets used vastly more by the encoder 
than the decoder... I think the decoder only uses it for PLC).

Something like
./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw /dev/null

comp48-stereo.sw can be found here: 
https://people.xiph.org/~tterribe/opus/comp48-stereo.sw

celt_pitch_xcorr also gets used by the SILK encoder (more in fixed-point 
than float, but the float one uses it, too). So it may be worth doing a 
run with the application set to voip instead of restricted-lowdelay and 
a lower bitrate (e.g., 24000 instead of 96000).

Even though this primarily affects the encoder, as a sanity check, it's 
always good to make sure the test vectors still decode correctly. Get 
them from <http://opus-codec.org/testvectors/opus_testvectors.tar.gz> 
and use
tests/run_vectors.sh <build path> <test vectors path> 48000

Viswanath Puttagunta

2014-Nov-24 20:53 UTC

head link

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

On 21 November 2014 at 18:06, Timothy B. Terriberry <tterribe at xiph.org>
wrote:>
> Viswanath Puttagunta wrote:
>>
>> a. Simplest use case to validate this optimization for correctness.
>> b. Simplest use case to validate this optimization for performance.
>>
>> Would prefer something like opusdec that can be executed on command
>> line.
>
>
> The easiest thing to use is probably opus_demo (opusdec does a bunch of
extra things, plus for interactive use we care about both the encoder and
decoder, and celt_pitch_xcorr gets used vastly more by the encoder than the
decoder... I think the decoder only uses it for PLC).
>
> Something like
> ./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw /dev/null
>
> comp48-stereo.sw can be found here:
https://people.xiph.org/~tterribe/opus/comp48-stereo.sw
>
> celt_pitch_xcorr also gets used by the SILK encoder (more in fixed-point
than float, but the float one uses it, too). So it may be worth doing a run with
the application set to voip instead of restricted-lowdelay and a lower bitrate
(e.g., 24000 instead of 96000).
Thanks for your feedback. I have verified both above cases. While I used
./opus_demo restricted-lowdelay 48000 2 96000 comp48-stereo.sw out.wav,
./opus_demo voip 48000 2 24000 comp48-stereo.sw out.wav

to make sure the output out.wav is clearly audible, I used below
command (encode only) for performance benchmarking.

./opus_demo -e restricted-lowdelay 48000 2 96000 comp48-stereo.sw opus_raw.out
./opus_demo -e voip 48000 2 96000 comp48-stereo.sw opus_raw.out

I saw much better improvement in performance  (16.16%) for overall
encode use case for "restricted-lowdelay 48000 2 96000" for CELT
encoding as you suspected as celt_pitch_xcorr function gets used much
more.

I observed lesser improvement in performance (3.42%) for overall
encode use case for "voip 48000 2 24000". This is somewhat expected as
cel_pitch_xcorr_c was not the main contributor for performance in this
SILK encoder use case.

For detailed information on how I measured performance on my
Beaglebone Black (Cortex-A8), please see "celt_pitch_xcorr (float)
Neon Optimization" section of [1]

[1]:
https://docs.google.com/document/d/1L6csATjSsXtzg_sa1iHZta8hOsoVWA4UjHXEakpTrNk/edit?usp=sharing


>
> Even though this primarily affects the encoder, as a sanity check, it's
always good to make sure the test vectors still decode correctly. Get them from
<http://opus-codec.org/testvectors/opus_testvectors.tar.gz> and use
> tests/run_vectors.sh <build path> <test vectors path> 48000

Timothy B. Terriberry

2014-Nov-28 21:52 UTC

head link

[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

Review comments inline.
> +if OPUS_ARM_NEON_INTR
> +noinst_LTLIBRARIES = libarmneon.la
> +libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR)
> +libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS)
-I$(top_srcdir)/include
> +endif
I don't think these should be in a separate library. It brings with it 
lots of complications (to name one: wouldn't the .pc files need to be 
updated?). Please use the same mechanism that the SSE intrinsics use to 
add CFLAGS to the compilation of specific object files, e.g.,

$(SSE_OBJ): CFLAGS += -msse4.1
> +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> +    const opus_val16 *, opus_val32 *, int , int, int) = {
> +  celt_pitch_xcorr_c,               	/* ARMv4 */
> +  celt_pitch_xcorr_c,  		 	/* EDSP */
> +  celt_pitch_xcorr_c,			/* Media */
> +#if defined(OPUS_ARM_NEON_INTR)
> +  celt_pitch_xcorr_float_neon		 /* Neon */
Please do not use tabs in source code (this applies here and everywhere 
below). Even with the tabs expanded in context, the comments here do not 
line up properly.
> +static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int
len) {
x and y should be const.
> +	float32x4_t YY[5];
> +	float32x4_t XX[4];
> +	float32x2_t XX_2;
> +	float32x4_t SUMM[4];
> +	float *xi = x;
> +	float *yi = y;
> +	int cd = len/4;
> +	int cr = len%4;
len is signed, so / and % are NOT equivalent to the corresponding >> and 
& (they are much slower).
> +	int j;
> +
> +	celt_assert(len>=3);
> +
> +	/* Initialize sums to 0 */
> +	SUMM[0] = vdupq_n_f32(0);
> +	SUMM[1] = vdupq_n_f32(0);
> +	SUMM[2] = vdupq_n_f32(0);
> +	SUMM[3] = vdupq_n_f32(0);
> +
> +	YY[0] = vld1q_f32(yi);
> +
> +	/* Each loop consumes 8 floats in y vector
> +	 * and 4 floats in x vector
> +	 */
> +	for (j = 0; j < cd; j++) {
> +		yi += 4;
> +		YY[4] = vld1q_f32(yi);
If len == 4, then in the first iteration you will have loaded 8 y 
values, but only 7 are guaranteed to be available (e.g., the C code only 
references y[0] up to y[len-1+3]). You need to end this loop early and 
fall back to another approach. See comments in celt_pitch_xcorr_arm.s 
for details and an example (there are other useful comments there that 
could shave another cycle or two from this inner loop).
> +		YY[1] = vextq_f32(YY[0], YY[4], 1);
> +		YY[2] = vextq_f32(YY[0], YY[4], 2);
> +		YY[3] = vextq_f32(YY[0], YY[4], 3);
> +
> +		XX[0] = vld1q_dup_f32(xi++);
> +		XX[1] = vld1q_dup_f32(xi++);
> +		XX[2] = vld1q_dup_f32(xi++);
> +		XX[3] = vld1q_dup_f32(xi++);
Don't do this. Do a single load and use vmlaq_lane_f32() to multiply by 
each value. That should cut at least 5 cycles out of this loop.
> +
> +		SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]);
> +		SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]);
> +		SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]);
> +		SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]);
> +		YY[0] = YY[4];
> +	}
> +
> +	/* Handle remaining values max iterations = 3 */
> +	for (j = 0; j < cr; j++) {
> +		YY[0] = vld1q_f32(yi++);
This load is always redundant in the first iteration, which is a bit 
unfortunate.
> +		XX_2 = vld1_lane_f32(xi++, XX_2, 0);
Don't load a single lane when you don't need the value(s) in the other 
lane(s). Use vld1_dup_f32() instead. It's faster and breaks dependencies.
> +		SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0);
> +	}
> +
> +	SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]);
> +	SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]);
> +	SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]);
> +
> +	vst1q_f32(sum, SUMM[0]);
> +}
> +
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16
*_y,
> +			opus_val32 *xcorr, int len, int max_pitch, int arch) {
arch is unused. There's no reason to pass it here. If we're here, we 
know what the arch is.
> +	int i, j;
> +
> +	celt_assert(max_pitch > 0);
> +	celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
> +
> +	for (i = 0; i < (max_pitch-3); i += 4) {
> +		xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
> +					(float *)xcorr+i, len);
> +	}
> +
> +	/* In case max_pitch isn't multiple of 4, do unrolled version */
> +	for (; i < max_pitch; i++) {
> +		float sum = 0;
> +		float *yi = _y+i;
> +		for (j = 0; j < len; j++)
> +			sum += _x[i]*yi[i];
> +		xcorr[i] = sum;
> +	}
This loop can still be largely vectorized. Any reason not to do so?
> +}
> diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
> index a07f8ac..f5adc48 100644
> --- a/celt/arm/pitch_arm.h
> +++ b/celt/arm/pitch_arm.h
> @@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x,
const opus_val16 *_y,
>     ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len,
max_pitch))
>   #  endif
>
> -# endif
> +#else /* Start !FIXED_POINT */
> +/* Float case */
> +#if defined(OPUS_ARM_NEON_INTR)
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16
*_y,
> +				opus_val32 *xcorr, int len, int max_pitch, int arch);
> +#endif
> +
> +#if !defined(OPUS_HAVE_RTCD) &&
defined(OPUS_PRESUME_ARM_NEON_INTR)
> +#define OVERRIDE_PITCH_XCORR (1)
> +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> +	((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \
> +						len, max_pitch, arch))
Again, don't pass arch.
>   (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> -      const opus_val16 *, opus_val32 *, int, int);
> +      const opus_val16 *, opus_val32 *, int, int
> +#if !defined(FIXED_POINT)
> +	,int
> +#endif
> +);
Which gets rid of this ugliness.
> +#if defined(FIXED_POINT)
> +#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> +  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> +        xcorr, len, max_pitch)
> +#else
>   #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
>     ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> -        xcorr, len, max_pitch))
> +        xcorr, len, max_pitch, arch))
> +#endif
> +
And this.
> diff --git a/configure.ac b/configure.ac
> index 9b2f51f..09657b6 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -198,12 +198,11 @@ cpu_arm=no
>
>   AS_IF([test x"${enable_asm}" = x"yes"],[
>       inline_optimization="No ASM for your platform, please send
patches"
> +    OPUS_ARM_NEON_INTR_CPPFLAGS>       case $host_cpu in
>         arm*)
> -        dnl Currently we only have asm for fixed-point
> -        AS_IF([test "$enable_float" != "yes"],[
>               cpu_arm=yes
> -            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm
optimization])
> +            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm/intrinsic
optimization])
Not sure I'm in love with conflating intrinsics with inline assembly. 
For example, are these tests (especially the PRESUME_NEON stuff) going 
to do the right thing on aarch64?
>               AS_GCC_INLINE_ASSEMBLY(
>                   [inline_optimization="ARM"],
>                   [inline_optimization="disabled"]
> @@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" =
x"yes"],[
>               AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
>                   [OPUS_ARM_INLINE_MEDIA=0])
>              
AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
> +
> +	    AC_ARG_ENABLE([arm-neon-intrinsics],
> +		AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON
optimisations on ARM CPUs that support it]))
This should specify a default value for enable_arm_neon_intrinsics. 
However, I really think this switch should be unified with the 
--enable-intrinsics switch currently used by x86.
> +
> +	    AS_IF([test x"$enable_arm_neon_intrinsics" =
x"yes"],
> +		[
> +	        AC_MSG_CHECKING(if compiler supports arm neon intrinsics)
> +		save_CFLAGS="$CFLAGS"
> +		save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
> +		AC_COMPILE_IFELSE(
> +		    [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],
> +		    [
> +			OPUS_ARM_NEON_INTR=1
> +			OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"
> +			AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS)
> +		    ],
> +		    [
> +			OPUS_ARM_NEON_INTR=0
> +		    ])
> +		CFLAGS="$save_CFLAGS"
> +		AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"],
> +		    [AC_MSG_RESULT([yes])],
> +		    [AC_MSG_RESULT([no])])
> +		],
> +		[
> +		    OPUS_ARM_NEON_INTR=0
> +		    AC_MSG_WARN([ARMv7 neon intrinsics not enabled])
> +		])
> +
>               AS_IF([test x"$inline_optimization" =
x"ARM"],[
>                   AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
>                   AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
> @@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" =
x"yes"],[
>                       AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
>                           [Use ARMv5E inline asm optimizations])
>                       inline_optimization="$inline_optimization
(EDSP)"
> -                ])
> +                ]n)
Buh?
>                   AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" =
x"1"],[
>                       AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
>                           [Use ARMv6 inline asm optimizations])
> @@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" =
x"yes"],[
>                     [*** ARM assembly requires perl -- disabling
optimizations])
>                   asm_optimization="(missing perl dependency for
ARM)"
>               ])
> -        ])
> +	   AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [
> +		AC_DEFINE([OPUS_ARM_NEON_INTR], 1,
> +			[Compiler supports ARMv7 Neon Intrinsics]),
> +		AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
> +			AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1,
> +			    [Compiler support arm Intrinsics and target must support neon])],
> +			[])
> +		AS_IF([test x"enable_rtcd" != x""],
> +			[rtcd_support="$rtcd_support (NEON_INTR)"],
> +			[])
> +		],[])
>           ;;
>       esac
> -],[
> -   inline_optimization="disabled"
> -   asm_optimization="disabled"
> -])
> +],[])
>
>   AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
>   AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
> @@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
>   AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
>       [test x"${asm_optimization%% *}" = x"ARM"])
>
> +AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
> +    [test x"$OPUS_ARM_NEON_INTR" = x"1"])
> +
>   AM_CONDITIONAL([HAVE_SSE4_1], [false])
>   AM_CONDITIONAL([HAVE_SSE2], [false])
>   AS_IF([test x"$enable_intrinsics" = x"yes"],[

Maybe Matching Threads

Search for more possibly parallel threads

opus - Nov 2014 - [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

[opus] [RFC PATCHv1] cover: celt_pitch_xcorr: Introduce ARM neon intrinsics

[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

Maybe Matching Threads