Viswanath Puttagunta
2014-Dec-19 23:25 UTC
[opus] [PATCH v1] cover: armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Hi,
Optimizes celt_pitch_xcorr for ARM NEON floating point.
Changes from RFCv3:
- celt_neon_intr.c
- removed warnings due to not having constant pointers
- Put simpler loop to take care of corner cases. Unrolling using
intrinsics was not really mapping well to what was done
in celt_pitch_xcorr_arm.s
- Makefile.am
Removed explicit -O3 optimization
- test_unit_mathops.c, test_unit_rotation.c
followed recommendation to use #if #elif to guarantee that
only one of "arm/arm_celt_map.c" or "x86/x86_celt_map.c"
is included
Viswanath Puttagunta (1):
armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Makefile.am | 12 ++
celt/arm/arm_celt_map.c | 15 ++-
celt/arm/celt_neon_intr.c | 249 +++++++++++++++++++++++++++++++++++++++
celt/arm/pitch_arm.h | 13 +-
celt/cpu_support.h | 3 +-
celt/pitch.h | 6 +-
celt/tests/test_unit_mathops.c | 6 +-
celt/tests/test_unit_rotation.c | 6 +-
celt_sources.mk | 3 +
configure.ac | 83 +++++++++++--
10 files changed, 377 insertions(+), 19 deletions(-)
create mode 100644 celt/arm/celt_neon_intr.c
--
1.7.9.5
Viswanath Puttagunta
2014-Dec-19 23:25 UTC
[opus] [PATCH v1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Optimize celt_pitch_xcorr function (for floating point)
using ARM NEON intrinsics for SoCs that have NEON VFP unit.
To enable this optimization, use --enable-intrinsics
configure option.
Compile time and runtime checks are also supported to make sure
this optimization is only enabled when the compiler supports
neon intrinsics.
---
Makefile.am | 12 ++
celt/arm/arm_celt_map.c | 15 ++-
celt/arm/celt_neon_intr.c | 249 +++++++++++++++++++++++++++++++++++++++
celt/arm/pitch_arm.h | 13 +-
celt/cpu_support.h | 3 +-
celt/pitch.h | 6 +-
celt/tests/test_unit_mathops.c | 6 +-
celt/tests/test_unit_rotation.c | 6 +-
celt_sources.mk | 3 +
configure.ac | 83 +++++++++++--
10 files changed, 377 insertions(+), 19 deletions(-)
create mode 100644 celt/arm/celt_neon_intr.c
diff --git a/Makefile.am b/Makefile.am
index e20f7b4..95323ca 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,6 +41,12 @@ endif
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
SILK_SOURCES += $(SILK_SOURCES_ARM)
+
+if OPUS_ARM_NEON_INTR
+CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
+OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon
+endif
+
if OPUS_ARM_EXTERNAL_ASM
nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
@@ -260,3 +266,9 @@ if HAVE_SSE2
$(SSE_OBJ): CFLAGS += -msse2
endif
endif
+
+if OPUS_ARM_NEON_INTR
+CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
+ %test_unit_rotation.o %test_unit_mathops.o
+$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS)
+endif
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 547a84d..ecdf7ec 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -41,9 +41,18 @@ opus_val32 (*const
CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
};
-# else
-# error "Floating-point implementation is not supported by ARM asm
yet." \
- "Reconfigure with --disable-rtcd or send patches."
+# else /* !FIXED_POINT */
+void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+ const opus_val16 *, opus_val32 *, int, int) = {
+ celt_pitch_xcorr_c, /* ARMv4 */
+ celt_pitch_xcorr_c, /* EDSP */
+ celt_pitch_xcorr_c, /* Media */
+#if defined(OPUS_ARM_NEON_INTR)
+ celt_pitch_xcorr_float_neon /* Neon */
+#else
+ celt_pitch_xcorr_c /* Neon */
+#endif
+};
# endif
#endif
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
new file mode 100644
index 0000000..1787ae1
--- /dev/null
+++ b/celt/arm/celt_neon_intr.c
@@ -0,0 +1,249 @@
+/* Copyright (c) 2014-2015 Xiph.Org Foundation
+ Written by Viswanath Puttagunta */
+/**
+ @file celt_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for celt
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <arm_neon.h>
+#include "../arch.h"
+
+/*
+ * Function: xcorr_kernel_neon_float
+ * ---------------------------------
+ * Computes 4 correlation values and stores them in sum[4]
+ */
+static void xcorr_kernel_neon_float(const float *x, const float *y,
+ float sum[4], int len) {
+ float32x4_t YY[3];
+ float32x4_t YEXT[3];
+ float32x4_t XX[2];
+ float32x2_t XX_2;
+ float32x4_t SUMM;
+ const float *xi = x;
+ const float *yi = y;
+
+ celt_assert(len>0);
+
+ YY[0] = vld1q_f32(yi);
+ SUMM = vdupq_n_f32(0);
+
+ /* Consume 8 elements in x vector and 12 elements in y
+ * vector. However, the 12'th element never really gets
+ * touched in this loop. So, if len == 8, then we only
+ * must access y[0] to y[10]. y[11] must not be accessed
+ * hence make sure len > 8 and not len >= 8
+ */
+ while (len > 8) {
+ yi += 4;
+ YY[1] = vld1q_f32(yi);
+ yi += 4;
+ YY[2] = vld1q_f32(yi);
+
+ XX[0] = vld1q_f32(xi);
+ xi += 4;
+ XX[1] = vld1q_f32(xi);
+ xi += 4;
+
+ SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+ YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+ YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+ YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+ SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
+ YEXT[0] = vextq_f32(YY[1], YY[2], 1);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
+ YEXT[1] = vextq_f32(YY[1], YY[2], 2);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
+ YEXT[2] = vextq_f32(YY[1], YY[2], 3);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
+
+ YY[0] = YY[2];
+ len -= 8;
+ }
+
+ /* Consume 4 elements in x vector and 8 elements in y
+ * vector. However, the 8'th element in y never really gets
+ * touched in this loop. So, if len == 4, then we only
+ * must access y[0] to y[6]. y[7] must not be accessed
+ * hence make sure len>4 and not len>=4
+ */
+ if (len > 4) {
+ yi += 4;
+ YY[1] = vld1q_f32(yi);
+
+ XX[0] = vld1q_f32(xi);
+ xi += 4;
+
+ SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+ YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+ YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+ YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+ SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+ YY[0] = YY[1];
+ len -= 4;
+ }
+
+ yi++;
+ while (len > 1) {
+ XX_2 = vld1_dup_f32(xi++);
+ SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+ YY[0]= vld1q_f32(yi++);
+ len--;
+ }
+
+ if (len > 0) {
+ XX_2 = vld1_dup_f32(xi);
+ SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+ }
+
+ vst1q_f32(sum, SUMM);
+}
+
+/*
+ * Function: xcorr_kernel_neon_float_process1
+ * ---------------------------------
+ * Computes single correlation values and stores in *sum
+ */
+static void xcorr_kernel_neon_float_process1(const float *x, const float *y,
+ float *sum, int len) {
+ float32x4_t XX[4];
+ float32x4_t YY[4];
+ float32x2_t XX_2;
+ float32x2_t YY_2;
+ float32x4_t SUMM;
+ float32x2_t SUMM_2[2];
+ const float *xi = x;
+ const float *yi = y;
+
+ SUMM = vdupq_n_f32(0);
+
+ /* Work on 16 values per iteration */
+ while (len >= 16) {
+ XX[0] = vld1q_f32(xi);
+ xi += 4;
+ XX[1] = vld1q_f32(xi);
+ xi += 4;
+ XX[2] = vld1q_f32(xi);
+ xi += 4;
+ XX[3] = vld1q_f32(xi);
+ xi += 4;
+
+ YY[0] = vld1q_f32(yi);
+ yi += 4;
+ YY[1] = vld1q_f32(yi);
+ yi += 4;
+ YY[2] = vld1q_f32(yi);
+ yi += 4;
+ YY[3] = vld1q_f32(yi);
+ yi += 4;
+
+ SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+ SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
+ SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
+ SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
+ len -= 16;
+ }
+
+ /* Work on 8 values */
+ if (len >= 8) {
+ XX[0] = vld1q_f32(xi);
+ xi += 4;
+ XX[1] = vld1q_f32(xi);
+ xi += 4;
+
+ YY[0] = vld1q_f32(yi);
+ yi += 4;
+ YY[1] = vld1q_f32(yi);
+ yi += 4;
+
+ SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+ SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
+ len -= 8;
+ }
+
+ /* Work on 4 values */
+ if (len >= 4) {
+ XX[0] = vld1q_f32(xi);
+ xi += 4;
+ YY[0] = vld1q_f32(yi);
+ yi += 4;
+ SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+ len -= 4;
+ }
+
+ /* Start accumulating results */
+ SUMM_2[0] = vget_low_f32(SUMM);
+ if (len >= 2) {
+ /* While at it, consume 2 more values if available */
+ XX_2 = vld1_f32(xi);
+ xi += 2;
+ YY_2 = vld1_f32(yi);
+ yi += 2;
+ SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
+ len -= 2;
+ }
+ SUMM_2[1] = vget_high_f32(SUMM);
+ SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
+ SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
+ /* Ok, now we have result accumulated in SUMM_2[0].0 */
+
+ if (len > 0) {
+ /* Case when you have one value left */
+ XX_2 = vld1_dup_f32(xi);
+ YY_2 = vld1_dup_f32(yi);
+ SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
+ }
+
+ vst1_lane_f32(sum, SUMM_2[0], 0);
+}
+
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch) {
+ int i;
+ celt_assert(max_pitch > 0);
+ celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+
+ for (i = 0; i < (max_pitch-3); i += 4) {
+ xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
+ (float *)xcorr+i, len);
+ }
+
+ /* In case max_pitch isn't multiple of 4
+ * compute single correlation value per iteration
+ */
+ for (; i < max_pitch; i++) {
+ xcorr_kernel_neon_float_process1((float *)_x, (float *)_y+i,
+ (float *)xcorr+i, len);
+ }
+}
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index a07f8ac..125d1bc 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -52,6 +52,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const
opus_val16 *_y,
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# endif
-# endif
+#else /* Start !FIXED_POINT */
+/* Float case */
+#if defined(OPUS_ARM_NEON_INTR)
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+#if !defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_PITCH_XCORR (1)
+# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+#endif
+#endif
+#endif /* end !FIXED_POINT */
#endif
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 71efff1..1d62e2f 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -31,7 +31,8 @@
#include "opus_types.h"
#include "opus_defines.h"
-#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
+#if defined(OPUS_HAVE_RTCD) && \
+ (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
#include "arm/armcpu.h"
/* We currently support 4 ARM variants:
diff --git a/celt/pitch.h b/celt/pitch.h
index 5c6e551..4368cc5 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,7 +46,8 @@
#include "mips/pitch_mipsr1.h"
#endif
-#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+ || defined(OPUS_ARM_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
@@ -178,7 +179,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16
*_y,
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
+# if defined(OPUS_HAVE_RTCD) && \
+ (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
extern
# if defined(FIXED_POINT)
opus_val32
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 3076bbf..b9b1bcf 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -56,7 +56,11 @@
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+ || defined(OPUS_ARM_NEON_INTR))
+#if defined(OPUS_ARM_NEON_INTR)
+#include "arm/celt_neon_intr.c"
+#endif
#include "arm/arm_celt_map.c"
#endif
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 37ba74e..5507884 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -54,7 +54,11 @@
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+ || defined(OPUS_ARM_NEON_INTR))
+#if defined(OPUS_ARM_NEON_INTR)
+#include "arm/celt_neon_intr.c"
+#endif
#include "arm/arm_celt_map.c"
#endif
diff --git a/celt_sources.mk b/celt_sources.mk
index 20b1b1b..29ec937 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s
CELT_AM_SOURCES_ARM_ASM = \
celt/arm/armopts.s.in
+
+CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_neon_intr.c
diff --git a/configure.ac b/configure.ac
index 9b2f51f..1e6ab53 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,14 +190,14 @@ AC_ARG_ENABLE([rtcd],
[enable_rtcd=yes])
AC_ARG_ENABLE([intrinsics],
- [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations
(only for fixed point x86)])],,
+ [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations for
ARM(float) X86(fixed)])],,
[enable_intrinsics=no])
rtcd_support=no
cpu_arm=no
AS_IF([test x"${enable_asm}" = x"yes"],[
- inline_optimization="No ASM for your platform, please send
patches"
+ inline_optimization="No inline ASM for your platform, please send
patches"
case $host_cpu in
arm*)
dnl Currently we only have asm for fixed-point
@@ -343,7 +343,6 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
asm_optimization="disabled"
])
-AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
[test x"${inline_optimization%% *}" = x"ARM"])
AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
@@ -351,9 +350,58 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
AM_CONDITIONAL([HAVE_SSE4_1], [false])
AM_CONDITIONAL([HAVE_SSE2], [false])
+
AS_IF([test x"$enable_intrinsics" = x"yes"],[
-AS_IF([test x"$enable_float" = x"no"],
-[AS_IF([test x"$host_cpu" = x"i386" -o
x"$host_cpu" = x"i686" -o x"$host_cpu" =
x"x86_64"],[
+ case $host_cpu in
+ arm*)
+ cpu_arm=yes
+ AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics)
+ save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
+ AC_LINK_IFELSE(
+ [
+ AC_LANG_PROGRAM(
+ [#include <arm_neon.h>
+ ],
+ [[
+ float32x4_t A[2], SUMM; SUMM = vmlaq_f32(SUMM, A[0], A[1]);
+ ]]
+ )
+ ],[
+ OPUS_ARM_NEON_INTR=1
+ AC_MSG_RESULT([yes])
+ ],[
+ OPUS_ARM_NEON_INTR=0
+ AC_MSG_RESULT([no])
+ ]
+ )
+ CFLAGS="$save_CFLAGS"
+ #Now we know if compiler supports ARM neon intrinsics or not
+
+ #Currently we only have intrinsic optimization for floating point
+ AS_IF([test x"$enable_float" = x"yes"],
+ [
+ AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"],
+ [
+ AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon
Intrinsics])
+ AS_IF([test x"enable_rtcd" != x""],
+ [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[])
+ enable_intrinsics="$enable_intrinsics
ARMv7_Neon_Intrinsics"
+ dnl Don't see why defining these is necessary to check features
at runtime
+ AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support
EDSP Instructions])
+ AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support
MEDIA Instructions])
+ AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support
NEON instructions])
+ ],
+ [
+ AC_MSG_WARN([Compiler does not support ARM intrinsics])
+ enable_intrinsics=no
+ ])
+ ], [
+ AC_MSG_WARN([Currently on have ARM intrinsics for float])
+ enable_intrinsics=no
+ ])
+ ;;
+ "i386" | "i686" | "x86_64")
+ AS_IF([test x"$enable_float" = x"no"],[
AS_IF([test x"$enable_rtcd" = x"yes"],[
get_cpuid_by_asm="no"
AC_MSG_CHECKING([Get CPU Info])
@@ -423,7 +471,7 @@ AS_IF([test x"$enable_float" = x"no"],
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])])
- ],[
+ ],[ ##### Else case for AS_IF([test x"$?" =
x"0"])
gcc -Q --help=target | grep "\-msse2 "
AC_MSG_CHECKING([sse2])
AS_IF([test x"$?" = x"0"],[
@@ -446,13 +494,28 @@ AS_IF([test x"$enable_float" = x"no"],
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c
method])])
- ],[enable_intrinsics="no"])
+ ],[enable_intrinsics="no"]) #End of AS_IF([test
x"$?" = x"0"]
])
- ], [enable_intrinsics="no"])
-])
-], [enable_intrinsics="no"])
+ ], [
+ enable_intrinsics="no"
+ ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"]
+],
+[ ## Else case for AS_IF([test x"$enable_float" = x"no"]
+ AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for fixed
point])
+ enable_intrinsics="no"
+]) ## End of AS_IF([test x"$enable_float" = x"no"]
+ ;;
+ *)
+ AC_MSG_WARN([No intrinsics support for your architecture])
+ enable_intrinsics="no"
+ ;;
+ esac
])
+AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
+AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
+ [test x"$OPUS_ARM_NEON_INTR" = x"1"])
+
AS_IF([test x"$enable_rtcd" = x"yes"],[
AS_IF([test x"$rtcd_support" != x"no"],[
AC_DEFINE([OPUS_HAVE_RTCD], [1],
--
1.7.9.5
Viswanath Puttagunta
2014-Dec-19 23:43 UTC
[opus] [PATCH v1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
On 19 December 2014 at 17:25, Viswanath Puttagunta <viswanath.puttagunta at linaro.org> wrote:> Optimize celt_pitch_xcorr function (for floating point) > using ARM NEON intrinsics for SoCs that have NEON VFP unit. > > To enable this optimization, use --enable-intrinsics > configure option. > > Compile time and runtime checks are also supported to make sure > this optimization is only enabled when the compiler supports > neon intrinsics. > --- > Makefile.am | 12 ++ > celt/arm/arm_celt_map.c | 15 ++- > celt/arm/celt_neon_intr.c | 249 +++++++++++++++++++++++++++++++++++++++ > celt/arm/pitch_arm.h | 13 +- > celt/cpu_support.h | 3 +- > celt/pitch.h | 6 +- > celt/tests/test_unit_mathops.c | 6 +- > celt/tests/test_unit_rotation.c | 6 +- > celt_sources.mk | 3 + > configure.ac | 83 +++++++++++-- > 10 files changed, 377 insertions(+), 19 deletions(-) > create mode 100644 celt/arm/celt_neon_intr.c > > diff --git a/Makefile.am b/Makefile.am > index e20f7b4..95323ca 100644 > --- a/Makefile.am > +++ b/Makefile.am > @@ -41,6 +41,12 @@ endif > if CPU_ARM > CELT_SOURCES += $(CELT_SOURCES_ARM) > SILK_SOURCES += $(SILK_SOURCES_ARM) > + > +if OPUS_ARM_NEON_INTR > +CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) > +OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon > +endif > + > if OPUS_ARM_EXTERNAL_ASM > nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) > BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \ > @@ -260,3 +266,9 @@ if HAVE_SSE2 > $(SSE_OBJ): CFLAGS += -msse2 > endif > endif > + > +if OPUS_ARM_NEON_INTR > +CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ > + %test_unit_rotation.o %test_unit_mathops.o > +$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS) > +endif > diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c > index 547a84d..ecdf7ec 100644 > --- a/celt/arm/arm_celt_map.c > +++ b/celt/arm/arm_celt_map.c > @@ -41,9 +41,18 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, > MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ > MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ > }; > -# else > -# error "Floating-point implementation is not supported by ARM asm yet." \ > - "Reconfigure with --disable-rtcd or send patches." > +# else /* !FIXED_POINT */ > +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, > + const opus_val16 *, opus_val32 *, int, int) = { > + celt_pitch_xcorr_c, /* ARMv4 */ > + celt_pitch_xcorr_c, /* EDSP */ > + celt_pitch_xcorr_c, /* Media */ > +#if defined(OPUS_ARM_NEON_INTR) > + celt_pitch_xcorr_float_neon /* Neon */ > +#else > + celt_pitch_xcorr_c /* Neon */ > +#endif > +}; > # endif > > #endif > diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c > new file mode 100644 > index 0000000..1787ae1 > --- /dev/null > +++ b/celt/arm/celt_neon_intr.c > @@ -0,0 +1,249 @@ > +/* Copyright (c) 2014-2015 Xiph.Org Foundation > + Written by Viswanath Puttagunta */ > +/** > + @file celt_neon_intr.c > + @brief ARM Neon Intrinsic optimizations for celt > + */ > + > +/* > + Redistribution and use in source and binary forms, with or without > + modification, are permitted provided that the following conditions > + are met: > + > + - Redistributions of source code must retain the above copyright > + notice, this list of conditions and the following disclaimer. > + > + - Redistributions in binary form must reproduce the above copyright > + notice, this list of conditions and the following disclaimer in the > + documentation and/or other materials provided with the distribution. > + > + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER > + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > +*/ > +#include <arm_neon.h> > +#include "../arch.h" > + > +/* > + * Function: xcorr_kernel_neon_float > + * --------------------------------- > + * Computes 4 correlation values and stores them in sum[4] > + */ > +static void xcorr_kernel_neon_float(const float *x, const float *y, > + float sum[4], int len) { > + float32x4_t YY[3]; > + float32x4_t YEXT[3]; > + float32x4_t XX[2]; > + float32x2_t XX_2; > + float32x4_t SUMM; > + const float *xi = x; > + const float *yi = y; > + > + celt_assert(len>0); > + > + YY[0] = vld1q_f32(yi); > + SUMM = vdupq_n_f32(0); > + > + /* Consume 8 elements in x vector and 12 elements in y > + * vector. However, the 12'th element never really gets > + * touched in this loop. So, if len == 8, then we only > + * must access y[0] to y[10]. y[11] must not be accessed > + * hence make sure len > 8 and not len >= 8 > + */ > + while (len > 8) { > + yi += 4; > + YY[1] = vld1q_f32(yi); > + yi += 4; > + YY[2] = vld1q_f32(yi); > + > + XX[0] = vld1q_f32(xi); > + xi += 4; > + XX[1] = vld1q_f32(xi); > + xi += 4; > + > + SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); > + YEXT[0] = vextq_f32(YY[0], YY[1], 1); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); > + YEXT[1] = vextq_f32(YY[0], YY[1], 2); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); > + YEXT[2] = vextq_f32(YY[0], YY[1], 3); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); > + > + SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0); > + YEXT[0] = vextq_f32(YY[1], YY[2], 1); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1); > + YEXT[1] = vextq_f32(YY[1], YY[2], 2); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0); > + YEXT[2] = vextq_f32(YY[1], YY[2], 3); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1); > + > + YY[0] = YY[2]; > + len -= 8; > + } > + > + /* Consume 4 elements in x vector and 8 elements in y > + * vector. However, the 8'th element in y never really gets > + * touched in this loop. So, if len == 4, then we only > + * must access y[0] to y[6]. y[7] must not be accessed > + * hence make sure len>4 and not len>=4 > + */ > + if (len > 4) { > + yi += 4; > + YY[1] = vld1q_f32(yi); > + > + XX[0] = vld1q_f32(xi); > + xi += 4; > + > + SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); > + YEXT[0] = vextq_f32(YY[0], YY[1], 1); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); > + YEXT[1] = vextq_f32(YY[0], YY[1], 2); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); > + YEXT[2] = vextq_f32(YY[0], YY[1], 3); > + SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); > + > + YY[0] = YY[1]; > + len -= 4; > + } > + > + yi++; > + while (len > 1) { > + XX_2 = vld1_dup_f32(xi++); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + YY[0]= vld1q_f32(yi++); > + len--; > + } > + > + if (len > 0) { > + XX_2 = vld1_dup_f32(xi); > + SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); > + }Hi Timothy, After analyzing celt_pitch_xcorr_arm.s, the closest I came using intrinsics is below code.. which didn't really put much dent in the performance.. so I just left it out since above code submitted is much simpler to read than below celt_pitch_xcorr_arm.s.. So, I request to leave it simple to read for now. float32x2_t YY_2; while (len > 0) { switch(len) { case 4: case 3: XX_2 = vld1_f32(xi); xi += 2; YY_2 = vld1_f32(yi+4); YY[1] = vcombine_f32(YY_2, YY_2); SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); YEXT[0] = vextq_f32(YY[0], YY[1], 1); SUMM = vmlaq_lane_f32(SUMM, YEXT[0], XX_2, 1); YY[0] = vcombine_f32(vget_high_f32(YY[0]), YY_2); len -=2; break; case 2: XX_2 = vld1_f32(xi); YY[1] = vld1q_f32(yi+1); SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); SUMM = vmlaq_lane_f32(SUMM, YY[1], XX_2, 1); len -= 2; break; case 1: XX_2 = vld1_f32(xi); SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); len--; break; } }> + > + vst1q_f32(sum, SUMM); > +} > + > +/* > + * Function: xcorr_kernel_neon_float_process1 > + * --------------------------------- > + * Computes single correlation values and stores in *sum > + */ > +static void xcorr_kernel_neon_float_process1(const float *x, const float *y, > + float *sum, int len) { > + float32x4_t XX[4]; > + float32x4_t YY[4]; > + float32x2_t XX_2; > + float32x2_t YY_2; > + float32x4_t SUMM; > + float32x2_t SUMM_2[2]; > + const float *xi = x; > + const float *yi = y; > + > + SUMM = vdupq_n_f32(0); > + > + /* Work on 16 values per iteration */ > + while (len >= 16) { > + XX[0] = vld1q_f32(xi); > + xi += 4; > + XX[1] = vld1q_f32(xi); > + xi += 4; > + XX[2] = vld1q_f32(xi); > + xi += 4; > + XX[3] = vld1q_f32(xi); > + xi += 4; > + > + YY[0] = vld1q_f32(yi); > + yi += 4; > + YY[1] = vld1q_f32(yi); > + yi += 4; > + YY[2] = vld1q_f32(yi); > + yi += 4; > + YY[3] = vld1q_f32(yi); > + yi += 4; > + > + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); > + SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); > + SUMM = vmlaq_f32(SUMM, YY[2], XX[2]); > + SUMM = vmlaq_f32(SUMM, YY[3], XX[3]); > + len -= 16; > + } > + > + /* Work on 8 values */ > + if (len >= 8) { > + XX[0] = vld1q_f32(xi); > + xi += 4; > + XX[1] = vld1q_f32(xi); > + xi += 4; > + > + YY[0] = vld1q_f32(yi); > + yi += 4; > + YY[1] = vld1q_f32(yi); > + yi += 4; > + > + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); > + SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); > + len -= 8; > + } > + > + /* Work on 4 values */ > + if (len >= 4) { > + XX[0] = vld1q_f32(xi); > + xi += 4; > + YY[0] = vld1q_f32(yi); > + yi += 4; > + SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); > + len -= 4; > + } > + > + /* Start accumulating results */ > + SUMM_2[0] = vget_low_f32(SUMM); > + if (len >= 2) { > + /* While at it, consume 2 more values if available */ > + XX_2 = vld1_f32(xi); > + xi += 2; > + YY_2 = vld1_f32(yi); > + yi += 2; > + SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2); > + len -= 2; > + } > + SUMM_2[1] = vget_high_f32(SUMM); > + SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]); > + SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]); > + /* Ok, now we have result accumulated in SUMM_2[0].0 */ > + > + if (len > 0) { > + /* Case when you have one value left */ > + XX_2 = vld1_dup_f32(xi); > + YY_2 = vld1_dup_f32(yi); > + SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2); > + } > + > + vst1_lane_f32(sum, SUMM_2[0], 0); > +} > + > +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, > + opus_val32 *xcorr, int len, int max_pitch) { > + int i; > + celt_assert(max_pitch > 0); > + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); > + > + for (i = 0; i < (max_pitch-3); i += 4) { > + xcorr_kernel_neon_float((float *)_x, (float *)_y+i, > + (float *)xcorr+i, len); > + } > + > + /* In case max_pitch isn't multiple of 4 > + * compute single correlation value per iteration > + */ > + for (; i < max_pitch; i++) { > + xcorr_kernel_neon_float_process1((float *)_x, (float *)_y+i, > + (float *)xcorr+i, len); > + } > +} > diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h > index a07f8ac..125d1bc 100644 > --- a/celt/arm/pitch_arm.h > +++ b/celt/arm/pitch_arm.h > @@ -52,6 +52,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, > ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) > # endif > > -# endif > +#else /* Start !FIXED_POINT */ > +/* Float case */ > +#if defined(OPUS_ARM_NEON_INTR) > +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, > + opus_val32 *xcorr, int len, int max_pitch); > +#if !defined(OPUS_HAVE_RTCD) > +#define OVERRIDE_PITCH_XCORR (1) > +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ > + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch)) > +#endif > +#endif > > +#endif /* end !FIXED_POINT */ > #endif > diff --git a/celt/cpu_support.h b/celt/cpu_support.h > index 71efff1..1d62e2f 100644 > --- a/celt/cpu_support.h > +++ b/celt/cpu_support.h > @@ -31,7 +31,8 @@ > #include "opus_types.h" > #include "opus_defines.h" > > -#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) > +#if defined(OPUS_HAVE_RTCD) && \ > + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > #include "arm/armcpu.h" > > /* We currently support 4 ARM variants: > diff --git a/celt/pitch.h b/celt/pitch.h > index 5c6e551..4368cc5 100644 > --- a/celt/pitch.h > +++ b/celt/pitch.h > @@ -46,7 +46,8 @@ > #include "mips/pitch_mipsr1.h" > #endif > > -#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT) > +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ > + || defined(OPUS_ARM_NEON_INTR)) > # include "arm/pitch_arm.h" > #endif > > @@ -178,7 +179,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, > > #if !defined(OVERRIDE_PITCH_XCORR) > /*Is run-time CPU detection enabled on this platform?*/ > -# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) > +# if defined(OPUS_HAVE_RTCD) && \ > + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > extern > # if defined(FIXED_POINT) > opus_val32 > diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c > index 3076bbf..b9b1bcf 100644 > --- a/celt/tests/test_unit_mathops.c > +++ b/celt/tests/test_unit_mathops.c > @@ -56,7 +56,11 @@ > #include "x86/celt_lpc_sse.c" > #endif > #include "x86/x86_celt_map.c" > -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) > +#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ > + || defined(OPUS_ARM_NEON_INTR)) > +#if defined(OPUS_ARM_NEON_INTR) > +#include "arm/celt_neon_intr.c" > +#endif > #include "arm/arm_celt_map.c" > #endif > > diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c > index 37ba74e..5507884 100644 > --- a/celt/tests/test_unit_rotation.c > +++ b/celt/tests/test_unit_rotation.c > @@ -54,7 +54,11 @@ > #include "x86/celt_lpc_sse.c" > #endif > #include "x86/x86_celt_map.c" > -#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT) > +#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ > + || defined(OPUS_ARM_NEON_INTR)) > +#if defined(OPUS_ARM_NEON_INTR) > +#include "arm/celt_neon_intr.c" > +#endif > #include "arm/arm_celt_map.c" > #endif > > diff --git a/celt_sources.mk b/celt_sources.mk > index 20b1b1b..29ec937 100644 > --- a/celt_sources.mk > +++ b/celt_sources.mk > @@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s > > CELT_AM_SOURCES_ARM_ASM = \ > celt/arm/armopts.s.in > + > +CELT_SOURCES_ARM_NEON_INTR = \ > +celt/arm/celt_neon_intr.c > diff --git a/configure.ac b/configure.ac > index 9b2f51f..1e6ab53 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -190,14 +190,14 @@ AC_ARG_ENABLE([rtcd], > [enable_rtcd=yes]) > > AC_ARG_ENABLE([intrinsics], > - [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],, > + [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations for ARM(float) X86(fixed)])],, > [enable_intrinsics=no]) > > rtcd_support=no > cpu_arm=no > > AS_IF([test x"${enable_asm}" = x"yes"],[ > - inline_optimization="No ASM for your platform, please send patches" > + inline_optimization="No inline ASM for your platform, please send patches" > case $host_cpu in > arm*) > dnl Currently we only have asm for fixed-point > @@ -343,7 +343,6 @@ AS_IF([test x"${enable_asm}" = x"yes"],[ > asm_optimization="disabled" > ]) > > -AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) > AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], > [test x"${inline_optimization%% *}" = x"ARM"]) > AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], > @@ -351,9 +350,58 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], > > AM_CONDITIONAL([HAVE_SSE4_1], [false]) > AM_CONDITIONAL([HAVE_SSE2], [false]) > + > AS_IF([test x"$enable_intrinsics" = x"yes"],[ > -AS_IF([test x"$enable_float" = x"no"], > -[AS_IF([test x"$host_cpu" = x"i386" -o x"$host_cpu" = x"i686" -o x"$host_cpu" = x"x86_64"],[ > + case $host_cpu in > + arm*) > + cpu_arm=yes > + AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics) > + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" > + AC_LINK_IFELSE( > + [ > + AC_LANG_PROGRAM( > + [#include <arm_neon.h> > + ], > + [[ > + float32x4_t A[2], SUMM; SUMM = vmlaq_f32(SUMM, A[0], A[1]); > + ]] > + ) > + ],[ > + OPUS_ARM_NEON_INTR=1 > + AC_MSG_RESULT([yes]) > + ],[ > + OPUS_ARM_NEON_INTR=0 > + AC_MSG_RESULT([no]) > + ] > + ) > + CFLAGS="$save_CFLAGS" > + #Now we know if compiler supports ARM neon intrinsics or not > + > + #Currently we only have intrinsic optimization for floating point > + AS_IF([test x"$enable_float" = x"yes"], > + [ > + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], > + [ > + AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics]) > + AS_IF([test x"enable_rtcd" != x""], > + [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[]) > + enable_intrinsics="$enable_intrinsics ARMv7_Neon_Intrinsics" > + dnl Don't see why defining these is necessary to check features at runtime > + AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions]) > + AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions]) > + AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions]) > + ], > + [ > + AC_MSG_WARN([Compiler does not support ARM intrinsics]) > + enable_intrinsics=no > + ]) > + ], [ > + AC_MSG_WARN([Currently on have ARM intrinsics for float]) > + enable_intrinsics=no > + ]) > + ;; > + "i386" | "i686" | "x86_64") > + AS_IF([test x"$enable_float" = x"no"],[ > AS_IF([test x"$enable_rtcd" = x"yes"],[ > get_cpuid_by_asm="no" > AC_MSG_CHECKING([Get CPU Info]) > @@ -423,7 +471,7 @@ AS_IF([test x"$enable_float" = x"no"], > AM_CONDITIONAL([HAVE_SSE2], [true]) > AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], > [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])]) > - ],[ > + ],[ ##### Else case for AS_IF([test x"$?" = x"0"]) > gcc -Q --help=target | grep "\-msse2 " > AC_MSG_CHECKING([sse2]) > AS_IF([test x"$?" = x"0"],[ > @@ -446,13 +494,28 @@ AS_IF([test x"$enable_float" = x"no"], > AM_CONDITIONAL([HAVE_SSE2], [true]) > AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], > [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])]) > - ],[enable_intrinsics="no"]) > + ],[enable_intrinsics="no"]) #End of AS_IF([test x"$?" = x"0"] > ]) > - ], [enable_intrinsics="no"]) > -]) > -], [enable_intrinsics="no"]) > + ], [ > + enable_intrinsics="no" > + ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"] > +], > +[ ## Else case for AS_IF([test x"$enable_float" = x"no"] > + AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for fixed point]) > + enable_intrinsics="no" > +]) ## End of AS_IF([test x"$enable_float" = x"no"] > + ;; > + *) > + AC_MSG_WARN([No intrinsics support for your architecture]) > + enable_intrinsics="no" > + ;; > + esac > ]) > > +AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) > +AM_CONDITIONAL([OPUS_ARM_NEON_INTR], > + [test x"$OPUS_ARM_NEON_INTR" = x"1"]) > + > AS_IF([test x"$enable_rtcd" = x"yes"],[ > AS_IF([test x"$rtcd_support" != x"no"],[ > AC_DEFINE([OPUS_HAVE_RTCD], [1], > -- > 1.7.9.5 >
Reasonably Related Threads
- [RFC PATCH v3] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [PATCH v1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC PATCH v2] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
- [RFC V3 5/8] aarch64: celt_pitch_xcorr: Fixed point intrinsics
- [[RFC PATCH v2]: Ne10 fft fixed and previous 5/8] aarch64: celt_pitch_xcorr: Fixed point intrinsics