Linfeng Zhang
2016-Sep-13  00:03 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
Call celt_inner_prod(), which takes an "arch" argument, instead of calling celt_inner_prod_c() directly.
---
 celt/bands.c                   | 7 ++++---
 celt/bands.h                   | 2 +-
 celt/celt_encoder.c            | 6 +++---
 celt/pitch.c                   | 2 +-
 src/opus_multistream_encoder.c | 2 +-
 5 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/celt/bands.c b/celt/bands.c
index bbe8a4c..1ab24aa 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos)
 
 #ifdef FIXED_POINT
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch)
 {
    int i, c, N;
    const opus_int16 *eBands = m->eBands;
+   (void)arch;
    N = m->shortMdctSize<<LM;
    c=0; do {
       for (i=0;i<end;i++)
@@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig *
OPUS_RESTRICT freq, cel
 
 #else /* FIXED_POINT */
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch)
 {
    int i, c, N;
    const opus_int16 *eBands = m->eBands;
@@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig
*X, celt_ener *band
       for (i=0;i<end;i++)
       {
          opus_val32 sum;
-         sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)],
&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
+         sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)],
&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch);
          bandE[i+c*m->nbEBands] = celt_sqrt(sum);
          /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
       }
diff --git a/celt/bands.h b/celt/bands.h
index c040c7f..61ae0cd 100644
--- a/celt/bands.h
+++ b/celt/bands.h
@@ -41,7 +41,7 @@
  * @param X Spectrum
  * @param bandE Square root of the energy for each band (returned)
  */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM);
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch);
 
 /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const
opus_val16 *tonality, celt_ener *bandE);*/
 
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 5be7610..8af61d5 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
    if (secondMdct)
    {
       compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample,
st->arch);
-      compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+      compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
       amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
       for (i=0;i<C*nbEBands;i++)
          bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
@@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
    compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
st->arch);
    if (CC==2&&C==1)
       tf_chan = 0;
-   compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+   compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
 
    if (st->lfe)
    {
@@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
          isTransient = 1;
          shortBlocks = M;
          compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
st->arch);
-         compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+         compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
          amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
          for (i=0;i<C*nbEBands;i++)
diff --git a/celt/pitch.c b/celt/pitch.c
index bf46e7d..f944a33 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp,
opus_val16 * OPUS_RESTR
       for (j=0;j<len>>1;j++)
          sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
 #else
-      sum = celt_inner_prod_c(x_lp, y+i, len>>1);
+      sum = celt_inner_prod(x_lp, y+i, len>>1, arch);
 #endif
       xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index c07132f..6ecea5d 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode, const void
*pcm, opus_val16 *b
             freq[i] = 0;
       }
 
-      compute_band_energies(celt_mode, freq, bandE, 21, 1, LM);
+      compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch);
       amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1);
       /* Apply spreading function with -6 dB/band going up and -12 dB/band
going down. */
       for (i=1;i<21;i++)
-- 
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13  00:03 UTC
[opus] [PATCH 13/15] Replace call of celt_inner_prod_c() (step 2)
Call celt_inner_prod() instead of celt_inner_prod_c().
This requires changing the celt_pitch_xcorr() API so that "arch" is
passed in to every implementation.
---
 celt/arm/arm_celt_map.c   |  4 ++--
 celt/arm/celt_neon_intr.c |  3 ++-
 celt/arm/pitch_arm.h      | 24 +++++++++---------------
 celt/pitch.c              | 13 -------------
 celt/pitch.h              | 11 ++---------
 5 files changed, 15 insertions(+), 40 deletions(-)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 1d53d62..6e28c70 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -51,7 +51,7 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16
*,
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) &&
!defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) &&
!defined(OPUS_ARM_PRESUME_EDSP)))
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int , int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
@@ -62,7 +62,7 @@ opus_val32 (*const
CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
 # else /* !FIXED_POINT */
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int, int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index 47bbe3d..f8f1d43 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -290,8 +290,9 @@ static void xcorr_kernel_neon_float_process1(const float32_t
*x,
 }
 
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                        opus_val32 *xcorr, int len, int max_pitch) {
+                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
    int i;
+   (void)arch;
    celt_assert(max_pitch > 0);
    celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index 1433116..d8b022e 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -34,7 +34,7 @@
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
@@ -43,7 +43,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const
opus_val16 *_y,
 
 #  if defined(OPUS_ARM_MAY_HAVE_EDSP)
 opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
@@ -52,18 +52,15 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const
opus_val16 *_y,
      (defined(OPUS_ARM_MAY_HAVE_EDSP) &&
!defined(OPUS_ARM_PRESUME_EDSP)))
 extern opus_val32
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+#   define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
 
 #  elif defined(OPUS_ARM_PRESUME_EDSP) || \
     defined(OPUS_ARM_PRESUME_MEDIA) || \
     defined(OPUS_ARM_PRESUME_NEON)
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
 
 #  endif
 
@@ -99,25 +96,22 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 /* Float case */
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                                 opus_val32 *xcorr, int len, int max_pitch);
+                                 opus_val32 *xcorr, int len, int max_pitch, int
arch);
 #endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
     (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR))
 extern void
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 
 #  define OVERRIDE_PITCH_XCORR (1)
-#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+#  define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
 
 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
 
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-   ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr celt_pitch_xcorr_float_neon
 
 #  endif
 
diff --git a/celt/pitch.c b/celt/pitch.c
index f944a33..874929e 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -220,13 +220,8 @@ opus_val32
 #else
 void
 #endif
-#if defined(OVERRIDE_PITCH_XCORR)
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
-      opus_val32 *xcorr, int len, int max_pitch)
-#else
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
       opus_val32 *xcorr, int len, int max_pitch, int arch)
-#endif
 {
 
 #if 0 /* This is a simple version of the pitch correlation that should work
@@ -265,11 +260,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16
*_y,
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-#if defined(OVERRIDE_PITCH_XCORR)
-      xcorr_kernel_c(_x, _y+i, sum, len);
-#else
       xcorr_kernel(_x, _y+i, sum, len, arch);
-#endif
       xcorr[i]=sum[0];
       xcorr[i+1]=sum[1];
       xcorr[i+2]=sum[2];
@@ -285,11 +276,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16
*_y,
    for (;i<max_pitch;i++)
    {
       opus_val32 sum;
-#if defined(OVERRIDE_PITCH_XCORR)
-      sum = celt_inner_prod_c(_x, _y+i, len);
-#else
       sum = celt_inner_prod(_x, _y+i, len, arch);
-#endif
       xcorr[i] = sum;
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
diff --git a/celt/pitch.h b/celt/pitch.h
index d350353..d797844 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -184,17 +184,10 @@ opus_val32
 void
 #endif
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
-      opus_val32 *xcorr, int len, int max_pitch);
-
-#if !defined(OVERRIDE_PITCH_XCORR)
-#ifdef FIXED_POINT
-opus_val32
-#else
-void
-#endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
       opus_val32 *xcorr, int len, int max_pitch, int arch);
 
+#ifndef OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_c
 #endif
 
 #endif
-- 
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13  00:03 UTC
[opus] [PATCH 14/15] Optimize celt_inner_prod() and dual_inner_prod() for ARM NEON
Created a corresponding unit test.
The fixed-point optimizations are bit exact with the C functions.
The floating-point optimizations are not bit exact with the C functions, because
the order of the floating-point operations changes. However, they are bit exact
with the C simulation functions, which simulate the floating-point operations
performed in the optimizations.
---
 celt/arm/arm_celt_map.c                   |  17 ++
 celt/arm/pitch_arm.h                      |  36 ++++
 celt/arm/pitch_neon_intr.c                | 179 ++++++++++++++++++++
 celt/pitch.h                              |   3 +-
 celt/tests/test_unit_dft.c                |   1 +
 celt/tests/test_unit_mathops.c            |   1 +
 celt/tests/test_unit_mdct.c               |   1 +
 celt/tests/test_unit_optimization_pitch.c | 263 ++++++++++++++++++++++++++++++
 celt/tests/test_unit_rotation.c           |   1 +
 celt/x86/pitch_sse.h                      |   5 +-
 celt_sources.mk                           |   3 +-
 tests/test_unit_optimization.c            |   4 +
 12 files changed, 508 insertions(+), 6 deletions(-)
 create mode 100644 celt/arm/pitch_neon_intr.c
 create mode 100644 celt/tests/test_unit_optimization_pitch.c
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 6e28c70..a1a553a 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -36,6 +36,23 @@
 
 #if defined(OPUS_HAVE_RTCD)
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
const opus_val16 *y, int N) = {
+  celt_inner_prod_c,             /* ARMv4 */
+  celt_inner_prod_c,             /* EDSP */
+  celt_inner_prod_c,             /* Media */
+  MAY_HAVE_NEON(celt_inner_prod) /* NEON */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const
opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2) = {
+  dual_inner_prod_c,             /* ARMv4 */
+  dual_inner_prod_c,             /* EDSP */
+  dual_inner_prod_c,             /* Media */
+  MAY_HAVE_NEON(dual_inner_prod) /* NEON */
+};
+# endif
+
 # if defined(FIXED_POINT)
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index d8b022e..d1a8db0 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -30,6 +30,42 @@
 
 # include "armcpu.h"
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int
N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+#  define OVERRIDE_CELT_INNER_PROD (1)
+#  define OVERRIDE_DUAL_INNER_PROD (1)
+#  define celt_inner_prod(x, y, N, arch) ((void)(arch),
PRESUME_NEON(celt_inner_prod)(x, y, N))
+#  define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),
PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+# endif
+
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
&& !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const
opus_val16 *x, const opus_val16 *y, int N);
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch)
((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch),
celt_inner_prod_neon(x, y, N))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
&& !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1,
opus_val32 *xy2);
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)
((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),
dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
 # if defined(FIXED_POINT)
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000..2bda6e1
--- /dev/null
+++ b/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file pitch_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt pitch functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int
N)
+{
+    int i;
+    opus_val32 xy;
+
+#ifdef FIXED_POINT
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+#else
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+#endif
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const
opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+
+#ifdef FIXED_POINT
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y02_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+#else
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+#endif
+
+    for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
diff --git a/celt/pitch.h b/celt/pitch.h
index d797844..e425f56 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,8 +46,7 @@
 #include "mips/pitch_mipsr1.h"
 #endif
 
-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
-  || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 # include "arm/pitch_arm.h"
 #endif
 
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 582618e..02904bf 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -54,6 +54,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "mdct.c"
 #   include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index f5af994..524c1f8 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -69,6 +69,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
 #   include "mdct.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 0658c7a..3b28767 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -55,6 +55,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "arm/celt_ne10_fft.c"
 #   include "arm/celt_ne10_mdct.c"
diff --git a/celt/tests/test_unit_optimization_pitch.c
b/celt/tests/test_unit_optimization_pitch.c
new file mode 100644
index 0000000..64bb2a9
--- /dev/null
+++ b/celt/tests/test_unit_optimization_pitch.c
@@ -0,0 +1,263 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "modes.h"
+#include "pitch.h"
+
+#define MAX_LEN_INNER_PROD 960
+
+#ifndef UNIT_TEST_CELT_INNER_PROD
+#define UNIT_TEST_CELT_INNER_PROD
+
+static inline float rand_float(float min, float max)
+{
+    return ((max - min) * ((float)rand() / RAND_MAX)) + min;
+}
+
+static OPUS_INLINE opus_val16 rand_val16(opus_val16 min, opus_val16 max)
+{
+#ifdef FIXED_POINT
+    (void)min;
+    (void)max;
+    return rand();
+#else
+    return rand_float(min, max);
+#endif
+}
+
+static OPUS_INLINE void init_val16_buffer(opus_val16* buffer, int num)
+{
+    const opus_val16 min = (opus_val16)-1e10;
+    const opus_val16 max = (opus_val16) 1e10;
+
+    for (int i = 0; i < num; i++) {
+        buffer[i] = rand_val16(min, max);
+    }
+}
+
+#endif
+
+/* ==========================================================================
*/
+/* This part of code simulates floating-point operations. */
+
+#ifndef FIXED_POINT
+
+/* celt_inner_prod_float_simulation_sse() simulates the floating-point
+ * operations of celt_inner_prod_sse(); both should have bit exact output.
+ */
+opus_val32 celt_inner_prod_float_simulation_sse(const opus_val16 *x,
+      const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+/* dual_inner_prod_float_simulation_sse() simulates the floating-point
operations
+ * of dual_inner_prod_sse(), and both functions should have bit exact output.
+ */
+void dual_inner_prod_float_simulation_sse(const opus_val16 *x, const opus_val16
*y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0,
xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2;
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+
+# define celt_inner_prod_float_simulation_c celt_inner_prod_c
+# define dual_inner_prod_float_simulation_c dual_inner_prod_c
+
+/* Reuse since NEON optimizations happen to have the same simulated
floating-point operations as SSE optimization. */
+# define celt_inner_prod_float_simulation_neon
celt_inner_prod_float_simulation_sse
+# define dual_inner_prod_float_simulation_neon
dual_inner_prod_float_simulation_sse
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+#  define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+#  define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+#  ifdef OPUS_X86_PRESUME_SSE
+#   define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),
celt_inner_prod_float_simulation_sse(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((void)(arch), dual_inner_prod_float_simulation_sse(x, y01, y02, N, xy1, xy2))
+#  else
+#   define celt_inner_prod_float_simulation(x, y, N, arch)
((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01,
y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK +
1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+        celt_inner_prod_float_simulation_c,                /* non-sse */
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation)
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+        dual_inner_prod_float_simulation_c,                /* non-sse */
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation)
+};
+#  endif /* !defined(OPUS_X86_PRESUME_SSE) */
+# endif /* OPUS_X86_MAY_HAVE_SSE */
+
+# ifdef OPUS_ARM_MAY_HAVE_NEON_INTR
+#  define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+#  define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+#  ifndef OPUS_HAVE_RTCD
+#   define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod_float_simulation)(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod_float_simulation)(x, y01, y02, N, xy1, xy2))
+#  else
+#   ifdef OPUS_ARM_PRESUME_NEON_INTR
+#    define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_neon(x, y, N))
+#    define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_neon(x, y01, y02, N, xy1, xy2))
+#   else
+#    define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#    define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+        celt_inner_prod_float_simulation_c,             /* ARMv4 */
+        celt_inner_prod_float_simulation_c,             /* EDSP */
+        celt_inner_prod_float_simulation_c,             /* Media */
+        MAY_HAVE_NEON(celt_inner_prod_float_simulation) /* NEON */
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+        dual_inner_prod_float_simulation_c,             /* ARMv4 */
+        dual_inner_prod_float_simulation_c,             /* EDSP */
+        dual_inner_prod_float_simulation_c,             /* Media */
+        MAY_HAVE_NEON(dual_inner_prod_float_simulation) /* NEON */
+};
+#   endif /* !defined(OPUS_ARM_PRESUME_NEON_INTR) */
+#  endif /* OPUS_HAVE_RTCD */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# ifndef OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION
+#  define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),celt_inner_prod_float_simulation_c(x, y, N))
+# endif
+
+# ifndef OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION
+#  define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),dual_inner_prod_float_simulation_c(x, y01, y02, N, xy1, xy2))
+# endif
+
+#endif /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+static int test_celt_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y[MAX_LEN_INNER_PROD];
+    opus_val32 xy_org, xy_opt;
+    int N;
+
+    printf("%44s() ...", __func__);
+    init_val16_buffer(x, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        xy_org = celt_inner_prod_c(x, y, N);
+#else
+        xy_org = celt_inner_prod_float_simulation(x, y, N, arch);
+#endif
+        xy_opt = celt_inner_prod(x, y, N, arch);
+        if (xy_org != xy_opt) {
+#ifdef FIXED_POINT
+            printf("\nN=%d xy_org = %d, xy_opt = %d failed!", N, xy_org, xy_opt);
+#else
+            printf("\nN=%d xy_org = %f, xy_opt = %f failed!", N, xy_org, xy_opt);
+#endif
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
+
+static int test_dual_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y01[MAX_LEN_INNER_PROD], y02[MAX_LEN_INNER_PROD];
+    opus_val32 xy1_org, xy1_opt, xy2_org, xy2_opt;
+    int N;
+
+    printf("%44s() ...", __func__);
+    init_val16_buffer(x,   MAX_LEN_INNER_PROD);
+    init_val16_buffer(y01, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y02, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        dual_inner_prod_c(x, y01, y02, N, &xy1_org, &xy2_org);
+#else
+        dual_inner_prod_float_simulation(x, y01, y02, N, &xy1_org, &xy2_org, arch);
+#endif
+        dual_inner_prod(x, y01, y02, N, &xy1_opt, &xy2_opt, arch);
+        if ((xy1_org != xy1_opt) || (xy2_org != xy2_opt)) {
+#ifdef FIXED_POINT
+            printf("\nN=%d xy1_org = %d, xy1_opt = %d failed!", N, xy1_org, xy1_opt);
+            printf("\nN=%d xy2_org = %d, xy2_opt = %d failed!", N, xy2_org, xy2_opt);
+#else
+            printf("\nN=%d xy1_org = %f, xy1_opt = %f failed!", N, xy1_org, xy1_opt);
+            printf("\nN=%d xy2_org = %f, xy2_opt = %f failed!", N, xy2_org, xy2_opt);
+#endif
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 785b40d..0b73839 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -67,6 +67,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
 #   include "mdct.c"
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index e5f87ab..5e85599 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2(
     int               N);
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 opus_val32 celt_inner_prod_sse(
     const opus_val16 *x,
     const opus_val16 *y,
@@ -104,7 +104,7 @@ opus_val32 celt_inner_prod_sse(
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse4_1(x, y, N))
 
-#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT)
 #define OVERRIDE_CELT_INNER_PROD
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse2(x, y, N))
@@ -114,7 +114,6 @@ opus_val32 celt_inner_prod_sse(
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
-
 #elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
     (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
 
diff --git a/celt_sources.mk b/celt_sources.mk
index c4bd285..dc107d9 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -38,7 +38,8 @@ celt/arm/armopts.s.in
 
 CELT_SOURCES_ARM_NEON_INTR = \
 celt/arm/celt_lpc_neon_intr.c \
-celt/arm/celt_neon_intr.c
+celt/arm/celt_neon_intr.c \
+celt/arm/pitch_neon_intr.c
 
 CELT_SOURCES_ARM_NE10= \
 celt/arm/celt_ne10_fft.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 6155dfb..a88ac21 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -71,6 +71,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "celt/arm/celt_lpc_neon_intr.c"
 #  include "celt/arm/celt_neon_intr.c"
+#  include "celt/arm/pitch_neon_intr.c"
 #  include "silk/arm/biquad_alt_neon_intr.c"
 #  include "silk/arm/inner_prod_aligned_neon_intr.c"
 #  include "silk/arm/LPC_analysis_filter_neon_intr.c"
@@ -94,6 +95,7 @@
 
 #endif
 
+# include "celt/tests/test_unit_optimization_pitch.c"
 # include "silk/tests/test_unit_optimization_biquad_alt.c"
 # include "silk/tests/test_unit_optimization_inner_prod_aligned.c"
 # include "silk/tests/test_unit_optimization_LPC_analysis_filter.c"
@@ -118,6 +120,8 @@ int main(void)
       result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
       result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
+      result |= test_celt_inner_prod(arch);
+      result |= test_dual_inner_prod(arch);
       result |= test_silk_biquad_alt(arch);
       result |= test_silk_inner_prod_aligned_scale(arch);
       result |= test_silk_LPC_analysis_filter(arch);
-- 
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13  00:03 UTC
[opus] [PATCH 15/15] Clean celt_pitch_xcorr_float_neon()
Call celt_inner_prod_neon() and remove redundant code.
---
 celt/arm/celt_neon_intr.c | 105 +---------------------------------------------
 1 file changed, 2 insertions(+), 103 deletions(-)
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index f8f1d43..cf44398 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -191,104 +191,6 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
    vst1q_f32(sum, SUMM);
 }
 
-/*
- * Function: xcorr_kernel_neon_float_process1
- * ---------------------------------
- * Computes single correlation values and stores in *sum
- */
-static void xcorr_kernel_neon_float_process1(const float32_t *x,
-      const float32_t *y, float32_t *sum, int len) {
-   float32x4_t XX[4];
-   float32x4_t YY[4];
-   float32x2_t XX_2;
-   float32x2_t YY_2;
-   float32x4_t SUMM;
-   float32x2_t SUMM_2[2];
-   const float32_t *xi = x;
-   const float32_t *yi = y;
-
-   SUMM = vdupq_n_f32(0);
-
-   /* Work on 16 values per iteration */
-   while (len >= 16) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-      XX[2] = vld1q_f32(xi);
-      xi += 4;
-      XX[3] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-      YY[2] = vld1q_f32(yi);
-      yi += 4;
-      YY[3] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
-      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
-      len -= 16;
-   }
-
-   /* Work on 8 values */
-   if (len >= 8) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      len -= 8;
-   }
-
-   /* Work on 4 values */
-   if (len >= 4) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      len -= 4;
-   }
-
-   /* Start accumulating results */
-   SUMM_2[0] = vget_low_f32(SUMM);
-   if (len >= 2) {
-      /* While at it, consume 2 more values if available */
-      XX_2 = vld1_f32(xi);
-      xi += 2;
-      YY_2 = vld1_f32(yi);
-      yi += 2;
-      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
-      len -= 2;
-   }
-   SUMM_2[1] = vget_high_f32(SUMM);
-   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
-   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
-   /* Ok, now we have result accumulated in SUMM_2[0].0 */
-
-   if (len > 0) {
-      /* Case when you have one value left */
-      XX_2 = vld1_dup_f32(xi);
-      YY_2 = vld1_dup_f32(yi);
-      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
-   }
-
-   vst1_lane_f32(sum, SUMM_2[0], 0);
-}
-
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                         opus_val32 *xcorr, int len, int max_pitch, int arch) {
    int i;
@@ -301,12 +203,9 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
             (float32_t *)xcorr+i, len);
    }
 
-   /* In case max_pitch isn't multiple of 4
-    * compute single correlation value per iteration
-    */
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (; i < max_pitch; i++) {
-      xcorr_kernel_neon_float_process1((const float32_t *)_x,
-            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
+      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
    }
 }
 #endif
-- 
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13  00:17 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
I'm attaching the previous 11 NEON optimization patches in a zip file, which is the same as the one I sent on September 6th in reply to thread "[PATCH 9/9] Optimize silk_inner_prod_aligned_scale() for ARM NEON".

Thanks,
Linfeng

On Mon, Sep 12, 2016 at 5:03 PM, Linfeng Zhang <linfengz at google.com> wrote:
> Should call celt_inner_prod().
> ---
> celt/bands.c | 7 ++++---
> celt/bands.h | 2 +-
> celt/celt_encoder.c | 6 +++---
> celt/pitch.c | 2 +-
> src/opus_multistream_encoder.c | 2 +-
> 5 files changed, 10 insertions(+), 9 deletions(-)
>
> diff --git a/celt/bands.c b/celt/bands.c
> index bbe8a4c..1ab24aa 100644
> --- a/celt/bands.c
> +++ b/celt/bands.c
> @@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos)
>
> #ifdef FIXED_POINT
> /* Compute the amplitude (sqrt energy) in each of the bands */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM)
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch)
> {
> int i, c, N;
> const opus_int16 *eBands = m->eBands;
> + (void)arch;
> N = m->shortMdctSize<<LM;
> c=0; do {
> for (i=0;i<end;i++)
> @@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig
> * OPUS_RESTRICT freq, cel
>
> #else /* FIXED_POINT */
> /* Compute the amplitude (sqrt energy) in each of the bands */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM)
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch)
> {
> int i, c, N;
> const opus_int16 *eBands = m->eBands;
> @@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const
> celt_sig *X, celt_ener *band
> for (i=0;i<end;i++)
> {
> opus_val32 sum;
> - sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)],
> &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
> + sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)],
> &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch);
> bandE[i+c*m->nbEBands] = celt_sqrt(sum);
> /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
> }
> diff --git a/celt/bands.h b/celt/bands.h
> index c040c7f..61ae0cd 100644
> --- a/celt/bands.h
> +++ b/celt/bands.h
> @@ -41,7 +41,7 @@
> * @param X Spectrum
> * @param bandE Square root of the energy for each band (returned)
> */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM);
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch);
>
> /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const
> opus_val16 *tonality, celt_ener *bandE);*/
>
> diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
> index 5be7610..8af61d5 100644
> --- a/celt/celt_encoder.c
> +++ b/celt/celt_encoder.c
> @@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> if (secondMdct)
> {
> compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
> amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
> for (i=0;i<C*nbEBands;i++)
> bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
> @@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
> st->arch);
> if (CC==2&&C==1)
> tf_chan = 0;
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
>
> if (st->lfe)
> {
> @@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> isTransient = 1;
> shortBlocks = M;
> compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM,
> st->upsample, st->arch);
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM,
> st->arch);
> amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
> /* Compensate for the scaling of short vs long mdcts */
> for (i=0;i<C*nbEBands;i++)
> diff --git a/celt/pitch.c b/celt/pitch.c
> index bf46e7d..f944a33 100644
> --- a/celt/pitch.c
> +++ b/celt/pitch.c
> @@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT
> x_lp, opus_val16 * OPUS_RESTR
> for (j=0;j<len>>1;j++)
> sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
> #else
> - sum = celt_inner_prod_c(x_lp, y+i, len>>1);
> + sum = celt_inner_prod(x_lp, y+i, len>>1, arch);
> #endif
> xcorr[i] = MAX32(-1, sum);
> #ifdef FIXED_POINT
> diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_
> encoder.c
> index c07132f..6ecea5d 100644
> --- a/src/opus_multistream_encoder.c
> +++ b/src/opus_multistream_encoder.c
> @@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode,
> const void *pcm, opus_val16 *b
> freq[i] = 0;
> }
>
> - compute_band_energies(celt_mode, freq, bandE, 21, 1, LM);
> + compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch);
> amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1);
> /* Apply spreading function with -6 dB/band going up and -12
> dB/band going down. */
> for (i=1;i<21;i++)
> --
> 2.8.0.rc3.226.g39d4020
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: opus-NEON-11-patches.zip
Type: application/zip
Size: 71375 bytes
Desc: not available
URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.zip>