thr3ads.net - opus - [opus] [PATCH 2/5] Optimize fixed-point celt_fir

If this information is useful, please help other people find it:
Share via:

Linfeng Zhang

2016-Jul-14 00:48 UTC

[opus] Several patches of ARM NEON optimization

I rebased my previous 3 patches to the current master with minor changes.
Patches 1 to 3 replace all my previous submitted patches.
Patches 4 and 5 are new.

Thanks,
Linfeng Zhang

Linfeng Zhang

2016-Jul-14 00:48 UTC

head link

[opus] [PATCH 1/5] Revise celt_fir_c() to not pass in argument "mem"

The "mem" in celt_fir_c() either is contained in the head of input
"x" in
reverse order already, or can be easily attached to the head of "x"
before
calling the function. Removing argument "mem" can eliminate the
redundant buffer
copies inside.
Update celt_fir_sse4_1() accordingly.
---
 celt/celt_decoder.c        | 10 ++++-----
 celt/celt_lpc.c            | 33 +++++++++++-------------------
 celt/celt_lpc.h            |  5 ++---
 celt/x86/celt_lpc_sse.c    | 51 +++++++++-------------------------------------
 celt/x86/celt_lpc_sse.h    | 10 ++++-----
 celt/x86/x86_celt_map.c    |  1 -
 silk/LPC_analysis_filter.c |  6 +-----
 7 files changed, 34 insertions(+), 82 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index b978bb3..f8433eb 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -509,7 +509,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st,
int N, int LM)
       opus_val16 fade = Q15ONE;
       int pitch_index;
       VARDECL(opus_val32, etmp);
-      VARDECL(opus_val16, exc);
+      VARDECL(opus_val16, _exc);
 
       if (loss_count == 0)
       {
@@ -520,7 +520,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st,
int N, int LM)
       }
 
       ALLOC(etmp, overlap, opus_val32);
-      ALLOC(exc, MAX_PERIOD, opus_val16);
+      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      opus_val16 *exc = _exc+LPC_ORDER;
       window = mode->window;
       c=0; do {
          opus_val16 decay;
@@ -568,15 +569,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT
st, int N, int LM)
          /* Initialize the LPC history with the samples just before the start
             of the region for which we're computing the excitation. */
          {
-            opus_val16 lpc_mem[LPC_ORDER];
             for (i=0;i<LPC_ORDER;i++)
             {
-               lpc_mem[i] +               exc[MAX_PERIOD-exc_length-1-i]       
ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
             }
             /* Compute the excitation for exc_length samples before the loss.
*/
             celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem,
st->arch);
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER,
st->arch);
          }
 
          /* Check if the waveform is decaying, and if so how fast.
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index b410a21..a7938af 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -89,56 +89,47 @@ int          p
 
 
 void celt_fir_c(
-         const opus_val16 *_x,
+         const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
    int i,j;
    VARDECL(opus_val16, rnum);
-   VARDECL(opus_val16, x);
    SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-   for (i=0;i<N;i++)
-      x[i+ord]=_x[i];
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    (void)arch;
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(rnum, x+i, sum, ord, arch);
-      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0],
SIG_SHIFT)));
-      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1],
SIG_SHIFT)));
-      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2],
SIG_SHIFT)));
-      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3],
SIG_SHIFT)));
+      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+      y[i  ] = SATURATE16(ADD32(EXTEND32(x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
+      y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+      y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+      y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 #endif
    RESTORE_STACK;
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index 323459e..a4c5fd6 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -45,12 +45,11 @@ void celt_fir_c(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if !defined(OVERRIDE_CELT_FIR)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    (celt_fir_c(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    (celt_fir_c(x, num, y, N, ord, arch))
 #endif
 
 void celt_iir(const opus_val32 *x,
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
index 67e5592..12a9b0e 100644
--- a/celt/x86/celt_lpc_sse.c
+++ b/celt/x86/celt_lpc_sse.c
@@ -40,63 +40,32 @@
 
 #if defined(FIXED_POINT)
 
-void celt_fir_sse4_1(const opus_val16 *_x,
+void celt_fir_sse4_1(const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
     int i,j;
     VARDECL(opus_val16, rnum);
-    VARDECL(opus_val16, x);
 
     __m128i vecNoA;
     opus_int32 noA ;
     SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-
-   for (i=0;i<N-7;i+=8)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-       x[i+ord+4]=_x[i+4];
-       x[i+ord+5]=_x[i+5];
-       x[i+ord+6]=_x[i+6];
-       x[i+ord+7]=_x[i+7];
-   }
-
-   for (;i<N-3;i+=4)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-   }
-
-   for (;i<N;i++)
-         x[i+ord]=_x[i];
-
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    noA = EXTEND32(1) << SIG_SHIFT >> 1;
@@ -107,22 +76,22 @@ void celt_fir_sse4_1(const opus_val16 *_x,
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
 
-      xcorr_kernel(rnum, x+i, sums, ord, arch);
+      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
 
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
-      vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+      vecX = OP_CVTEPI16_EPI32_M64(x + i);
       vecSum = _mm_add_epi32(vecSum, vecX);
       vecSum = _mm_packs_epi32(vecSum, vecSum);
-      _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+      _mm_storel_epi64((__m128i *)(y + i), vecSum);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum, rnum[j], x[i + j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 
 #endif
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index c5ec796..7d1ecf7 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -41,12 +41,11 @@ void celt_fir_sse4_1(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
 #else
 
@@ -56,11 +55,10 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
-#  define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem,
arch))
+#  define celt_fir(x, num, y, N, ord, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
 #endif
 #endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 47ba41b..5a1f5f9 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -46,7 +46,6 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16       *y,
          int              N,
          int              ord,
-         opus_val16       *mem,
          int              arch
 ) = {
   celt_fir_c,                /* non-sse */
diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index 2090667..5aeee4c 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -50,7 +50,6 @@ void silk_LPC_analysis_filter(
 {
     opus_int   j;
 #ifdef FIXED_POINT
-    opus_int16 mem[SILK_MAX_ORDER_LPC];
     opus_int16 num[SILK_MAX_ORDER_LPC];
 #else
     int ix;
@@ -67,10 +66,7 @@ void silk_LPC_analysis_filter(
     for ( j = 0; j < d; j++ ) {
         num[ j ] = -B[ j ];
     }
-    for (j=0;j<d;j++) {
-        mem[ j ] = in[ d - j - 1 ];
-    }
-    celt_fir( in + d, num, out + d, len - d, d, mem, arch );
+    celt_fir( in + d, num, out + d, len - d, d, arch );
     for ( j = 0; j < d; j++ ) {
         out[ j ] = 0;
     }
-- 
2.8.0.rc3.226.g39d4020

Linfeng Zhang

2016-Jul-14 00:48 UTC

head link

[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

Create the fixed-point intrinsics optimization celt_fir_neon() for ARM NEON.
Create test tests/test_unit_optimization to unit test the optimization.
---
 .gitignore                              |   1 +
 Makefile.am                             |  39 ++++-
 celt/arm/arm_celt_map.c                 |  17 +++
 celt/arm/celt_lpc_arm.h                 |  65 ++++++++
 celt/arm/celt_lpc_neon_intr.c           | 254 ++++++++++++++++++++++++++++++++
 celt/celt_lpc.h                         |   5 +
 celt/tests/test_unit_dft.c              |   1 +
 celt/tests/test_unit_mathops.c          |   1 +
 celt/tests/test_unit_mdct.c             |   1 +
 celt/tests/test_unit_optimization_lpc.c |  96 ++++++++++++
 celt/tests/test_unit_rotation.c         |   1 +
 celt_headers.mk                         |   1 +
 celt_sources.mk                         |   1 +
 tests/test_unit_optimization.c          |  62 ++++++++
 14 files changed, 541 insertions(+), 4 deletions(-)
 create mode 100644 celt/arm/celt_lpc_arm.h
 create mode 100644 celt/arm/celt_lpc_neon_intr.c
 create mode 100644 celt/tests/test_unit_optimization_lpc.c
 create mode 100644 tests/test_unit_optimization.c

diff --git a/.gitignore b/.gitignore
index 33127c9..05d0582 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ tests/test_opus_api
 tests/test_opus_decode
 tests/test_opus_encode
 tests/test_opus_padding
+tests/test_unit_optimization
 celt/arm/armopts.s
 celt/dump_modes/dump_modes
 celt/tests/test_unit_cwrs32
diff --git a/Makefile.am b/Makefile.am
index 7a69114..2bfb923 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -84,9 +84,36 @@ pkginclude_HEADERS = include/opus.h
include/opus_multistream.h include/opus_type
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
 
 if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api
tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding
celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_mathops celt/tests/test_unit_mdct
celt/tests/test_unit_rotation celt/tests/test_unit_types
-
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation
celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode
tests/test_opus_encode tests/test_opus_padding
+noinst_PROGRAMS = opus_demo \
+                  repacketizer_demo \
+                  opus_compare \
+                  celt/tests/test_unit_cwrs32 \
+                  celt/tests/test_unit_dft \
+                  celt/tests/test_unit_entropy \
+                  celt/tests/test_unit_laplace \
+                  celt/tests/test_unit_mathops \
+                  celt/tests/test_unit_mdct \
+                  celt/tests/test_unit_rotation \
+                  celt/tests/test_unit_types \
+                  tests/test_opus_api \
+                  tests/test_opus_encode \
+                  tests/test_opus_decode \
+                  tests/test_opus_padding \
+                  tests/test_unit_optimization
+
+TESTS = celt/tests/test_unit_types \
+        celt/tests/test_unit_mathops \
+        celt/tests/test_unit_entropy \
+        celt/tests/test_unit_laplace \
+        celt/tests/test_unit_dft \
+        celt/tests/test_unit_mdct \
+        celt/tests/test_unit_rotation \
+        celt/tests/test_unit_cwrs32 \
+        tests/test_opus_api \
+        tests/test_opus_decode \
+        tests/test_opus_encode \
+        tests/test_opus_padding \
+        tests/test_unit_optimization
 
 opus_demo_SOURCES = src/opus_demo.c
 
@@ -111,6 +138,9 @@ tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS)
$(LIBM)
 tests_test_opus_padding_SOURCES = tests/test_opus_padding.c
tests/test_opus_common.h
 tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
+tests_test_unit_optimization_SOURCES = tests/test_unit_optimization.c
+tests_test_unit_optimization_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
+
 celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
 celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
@@ -276,7 +306,8 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S):
$(top_srcdir)/celt/arm/arm2gnu.pl
 OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
-                    $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+                    $(celt_tests_test_unit_dft_SOURCES:.c=.o) \
+                    $(tests_test_unit_optimization_SOURCES:.c=.o)
 
 if HAVE_SSE
 SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 4d4d069..74869ab 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -29,6 +29,7 @@
 #include "config.h"
 #endif
 
+#include "celt_lpc.h"
 #include "pitch.h"
 #include "kiss_fft.h"
 #include "mdct.h"
@@ -39,6 +40,22 @@
 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) &&
!defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) &&
!defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) &&
!defined(OPUS_ARM_PRESUME_EDSP)))
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch);
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int) = {
+  celt_fir_c,             /* ARMv4 */
+  celt_fir_c,             /* EDSP */
+  celt_fir_c,             /* Media */
+  MAY_HAVE_NEON(celt_fir) /* NEON */
+};
+
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
     const opus_val16 *, opus_val32 *, int , int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h
new file mode 100644
index 0000000..101df3d
--- /dev/null
+++ b/celt/arm/celt_lpc_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(CELT_LPC_ARM_H)
+# define CELT_LPC_ARM_H
+
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_CELT_FIR (1)
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((void)(arch),PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch))
+#  endif
+
+#if !defined(OVERRIDE_CELT_FIR)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int);
+
+#  define OVERRIDE_CELT_FIR
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+# endif
+#endif
+#endif /* end FIXED_POINT */
+
+#endif /* end CELT_LPC_ARM_H */
diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c
new file mode 100644
index 0000000..4715d0b
--- /dev/null
+++ b/celt/arm/celt_lpc_neon_intr.c
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file celt_lpc_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt lpc functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch)
+{
+   int i,j;
+   const int leftover = N & 7;
+   const opus_val16 *x = _x-ord;
+   VARDECL(opus_val16, rnum);
+   SAVE_STACK;
+   /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero. */
+   ALLOC(rnum, ord+3, opus_val16);
+   for (i=0;i<ord-3;i+=4)
+      vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4)));
+   for (;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0;
+   (void)arch;
+
+#ifdef SMALL_FOOTPRINT
+   for (i=0;i<N-7;i+=8)
+   {
+      int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+      int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+      for (j=0;j<ord;j+=4)
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+      }
+      vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT),
vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)));
+   }
+   if (leftover)
+   {
+      if (leftover > 4)
+      {
+         int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+         int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+         }
+         const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4,
SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT));
+         vst1_s16(y+i, vget_low_s16(y_s16x8));
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else {
+         int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0),
rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1),
rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2),
rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3),
rnum_s16x4, 3);
+         }
+         const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#else
+   for (i=0;i<N-7;i+=8)
+   {
+      int32x4_t sum0_s32x4, sum1_s32x4;
+      sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+      for (j=0;j<ord;j+=4)
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+      }
+      sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+      sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+      const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+      sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+      vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4),
vqmovn_s32(sum1_s32x4)));
+   }
+   if (leftover)
+   {
+      if (leftover > 4)
+      {
+         int32x4_t sum0_s32x4, sum1_s32x4;
+         sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+         const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+         sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+         const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4),
vqmovn_s32(sum1_s32x4));
+         vst1_s16(y+i, vget_low_s16(y_s16x8));
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else {
+         int32x4_t sum0_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0),
rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1),
rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2),
rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3),
rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i));
+         const int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#endif
+   RESTORE_STACK;
+}
+
+#endif
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index a4c5fd6..76a73c0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -35,6 +35,11 @@
 #include "x86/celt_lpc_sse.h"
 #endif
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/celt_lpc_arm.h"
+#endif
+
 #define LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 6166eb0..582618e 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -52,6 +52,7 @@
 # include "celt_lpc.c"
 # include "pitch.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "mdct.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index fd3319d..da92f16 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -66,6 +66,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 8dbb9ca..0658c7a 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -53,6 +53,7 @@
 # include "pitch.c"
 # include "celt_lpc.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_optimization_lpc.c
b/celt/tests/test_unit_optimization_lpc.c
new file mode 100644
index 0000000..7247046
--- /dev/null
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+# define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#ifndef CELT_C
+# define CELT_C
+#endif
+#include "celt_lpc.h"
+#include "modes.h"
+
+#ifdef FIXED_POINT
+
+#define MAX_ORDER 32
+
+static int test_fir(int arch)
+{
+   opus_val16 x[MAX_PERIOD+MAX_ORDER];
+   opus_val16 num[MAX_ORDER];
+   opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD];
+   int N, ord;
+   unsigned int i;
+
+   printf("%50s", "celt_fir() ...");
+   for(ord=0;ord<=MAX_ORDER;ord++)
+   {
+      for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
+      {
+         for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
+         {
+            x[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_PERIOD;++i)
+         {
+            yorg[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_ORDER;++i)
+         {
+            num[i] = (rand() % 32767) - 16384;
+         }
+         memcpy(yopt, yorg, sizeof(yorg));
+
+         celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch);
+         celt_fir  (x+MAX_ORDER, num, yopt, N, ord, arch);
+         if (memcmp(yorg, yopt, sizeof(yorg)))
+         {
+            printf("ord=%2d N=%3d failed!\nError in lpc unit
test!!!\n", ord, N);
+            for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
+            {
+               if (yorg[i] != yopt[i])
+               {
+                  printf("yorg[%3d]=%d, yopt[%3d]=%d\n", i, yorg[i],
i, yopt[i]);
+               }
+            }
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 1080c20..3a85a29 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -64,6 +64,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt_headers.mk b/celt_headers.mk
index c9df94b..36ae290 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -34,6 +34,7 @@ celt/static_modes_fixed.h \
 celt/static_modes_float_arm_ne10.h \
 celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
+celt/arm/celt_lpc_arm.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
 celt/arm/fixed_arm64.h \
diff --git a/celt_sources.mk b/celt_sources.mk
index 2ffe99a..37c0129 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
 
 CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_lpc_neon_intr.c \
 celt/arm/celt_neon_intr.c
 
 CELT_SOURCES_ARM_NE10= \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
new file mode 100644
index 0000000..7eeab38
--- /dev/null
+++ b/tests/test_unit_optimization.c
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "stack_alloc.h"
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#ifdef FIXED_POINT
+
+# include "celt/tests/test_unit_optimization_lpc.c"
+
+#endif
+
+int main(void)
+{
+   int result = 0; /* 0: passed; other: failed */
+   ALLOC_STACK;
+#ifdef FIXED_POINT
+   int arch = opus_select_arch();
+#endif /* FIXED_POINT */
+   int count = 10;
+
+   while (!result && count--) {
+      printf("\n--------------------------- Testing optimization
---------------------------\n");
+#ifdef FIXED_POINT
+      result |= test_fir(arch);
+#endif /* FIXED_POINT */
+   }
+   return result;
+}
-- 
2.8.0.rc3.226.g39d4020

Linfeng Zhang

2016-Jul-14 00:49 UTC

head link

[opus] [PATCH 3/5] Optimize silk_warped_autocorrelation_FIX() for ARM NEON

Create silk_warped_autocorrelation_FIX_c_opt() which unrolls and parallelizes
input by 8. It has very long prolog and epilog, but this is the cost to get
good speed on this heavily hit function. This function may be the code base for
optimization on different CPUs.
Create ARM NEON intrinsics optimization silk_warped_autocorrelation_FIX_neon().
Create unit test silk/tests/test_unit_optimization_warped_autocorrelation.c.
---
 Makefile.am                                        |   5 +-
 silk/arm/arm_silk_map.c                            |  20 +
 silk/fixed/arm/warped_autocorrelation_FIX_arm.h    |  65 +++
 .../arm/warped_autocorrelation_FIX_neon_intr.c     | 495 +++++++++++++++++++++
 silk/fixed/main_FIX.h                              |  15 +-
 .../fixed/mips/warped_autocorrelation_FIX_mipsr1.h |   6 -
 silk/fixed/warped_autocorrelation_FIX.c            |   7 +-
 ...test_unit_optimization_warped_autocorrelation.c | 441 ++++++++++++++++++
 silk_headers.mk                                    |   1 +
 silk_sources.mk                                    |   3 +
 tests/test_unit_optimization.c                     |   2 +
 11 files changed, 1046 insertions(+), 14 deletions(-)
 create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_arm.h
 create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
 create mode 100644 silk/tests/test_unit_optimization_warped_autocorrelation.c

diff --git a/Makefile.am b/Makefile.am
index 2bfb923..c66fb2d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,6 +50,7 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
 SILK_SOURCES += $(SILK_SOURCES_ARM_NEON_INTR)
+SILK_SOURCES += $(SILK_SOURCES_FIXED_ARM_NEON_INTR)
 endif
 
 if HAVE_ARM_NE10
@@ -327,7 +328,9 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_X86_SSE4_1_CFLAGS)
 endif
 
 if HAVE_ARM_NEON_INTR
-ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo)
$(SILK_SOURCES_ARM_NEON_INTR:.c=.lo)
+ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(SILK_SOURCES_FIXED_ARM_NEON_INTR:.c=.lo)
 $(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
  $(OPUS_ARM_NEON_INTR_CFLAGS)  $(NE10_CFLAGS)
 endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 9bd86a7..2e330c4 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
 # include "config.h"
 #endif
 
+#include "main_FIX.h"
 #include "NSQ.h"
 
 #if defined(OPUS_HAVE_RTCD)
@@ -52,4 +53,23 @@ opus_int32
 
 # endif
 
+#if defined(FIXED_POINT) && \
+ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
+          opus_int32                *corr,                                  /*
O    Result [order + 1]                                                         
*/
+          opus_int                  *scale,                                 /*
O    Scaling of the correlation vector                                          
*/
+    const opus_int16                *input,                                 /*
I    Input data to correlate                                                    
*/
+    const opus_int                  warping_Q16,                            /*
I    Warping coefficient                                                        
*/
+    const opus_int                  length,                                 /*
I    Length of input                                                            
*/
+    const opus_int                  order                                   /*
I    Correlation order (even)                                                   
*/
+) = {
+      silk_warped_autocorrelation_FIX_c,              /* ARMv4 */
+      silk_warped_autocorrelation_FIX_c,              /* EDSP */
+      silk_warped_autocorrelation_FIX_c,              /* Media */
+      MAY_HAVE_NEON(silk_warped_autocorrelation_FIX), /* Neon */
+};
+
+#endif
+
 #endif /* OPUS_HAVE_RTCD */
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_arm.h
b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h
new file mode 100644
index 0000000..ee892bf
--- /dev/null
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(WARPED_AUTOCORRELATION_FIX_ARM_H)
+# define WARPED_AUTOCORRELATION_FIX_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+void silk_warped_autocorrelation_FIX_neon(
+          opus_int32                *corr,                                  /*
O    Result [order + 1]                                                         
*/
+          opus_int                  *scale,                                 /*
O    Scaling of the correlation vector                                          
*/
+    const opus_int16                *input,                                 /*
I    Input data to correlate                                                    
*/
+    const opus_int                  warping_Q16,                            /*
I    Warping coefficient                                                        
*/
+    const opus_int                  length,                                 /*
I    Length of input                                                            
*/
+    const opus_int                  order                                   /*
I    Correlation order (even)                                                   
*/
+);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_silk_warped_autocorrelation_FIX (1)
+#   define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16,
length, order) \
+  ((void)(arch),PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale,
input, warping_Q16, length, order))
+#  endif
+
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const
SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*,
const opus_int16*, const opus_int, const opus_int, const opus_int);
+
+#  define OVERRIDE_silk_warped_autocorrelation_FIX
+#   define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16,
length, order) \
+  ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr,
scale, input, warping_Q16, length, order))
+# endif
+#endif
+#endif /* end FIXED_POINT */
+
+#endif /* end WARPED_AUTOCORRELATION_FIX_ARM_H */
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
new file mode 100644
index 0000000..80dd949
--- /dev/null
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -0,0 +1,495 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file warped_autocorrelation_FIX_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for silk
silk_warped_autocorrelation_FIX functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+#define NUM_PARALLEL_INPUTS 8
+
+void silk_warped_autocorrelation_FIX_neon(
+         opus_int32                *corr,                                  /* O
Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O
Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I
Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I
Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I
Length of input                                                             */
+   const opus_int                  order                                   /* I
Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two
extra entries.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in
the last prolog and the last inner loop, and one extra end entry in the last
prolog.
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+      const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n +=
NUM_PARALLEL_INPUTS ) {
+         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4,
tmp2_QS1_s32x4;
+         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2,
corr_QC3_s64x2;
+         int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+         int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
+         int64x1_t corr_QC_s64x1;
+         const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n),    
QS);
+         const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4),
QS);
+         vst1q_s32(tmp1_QS,      input_QS0_s32x4);
+         vst1q_s32(tmp1_QS  + 4, input_QS1_s32x4);
+
+         /* Loop over allpass sections */
+
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp1_QS_s32x2  = vget_low_s32(input_QS0_s32x4);
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order); // Accessed one extra end
entry.
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC_s64x1  = vld1_s64(corr_QC + order);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
+         vst1_s64(corr_QC + order, corr_QC_s64x1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 1),
tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order - 1);
+         vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 2),
tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS0_s32x4));
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one
extra end entry.
+         vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4);  // Saving one extra
entry is OK.
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t1_s64x2));
+         vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3),
tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
+         vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4),
tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vget_low_s32(input_QS1_s32x4);
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
+         tmp2_QS_s32x2  = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
+         vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32
(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t2_s64x2));
+         vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5),
tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 1),
tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
+         tmp2_QS_s32x2  = vld1_s32 (state_QS + order - 1);
+         vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
+         vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32
(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6),
tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 2),
tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS1_s32x4));
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one
extra end entry.
+         vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
+         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra
entry is OK.
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t3_s64x2));
+         vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7),
tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3),
tmp1_QS1_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
+
+         /* -------------------- kernel loop -------------------- */
+
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+             /* Output of allpass section */
+            tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 1);
+            tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 5);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1,
tmp1_QS0_s32x4);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5,
tmp1_QS1_s32x4);
+            corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 1);
+            corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 3);
+            corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 5);
+            corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 7);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
vget_low_s32 (input_QS0_s32x4));
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
vget_low_s32 (input_QS1_s32x4));
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+            t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+            corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+            corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+            corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1,
corr_QC0_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3,
corr_QC1_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5,
corr_QC2_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7,
corr_QC3_s64x2);
+            tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS),     tmp1_QS0_s32x4); // Accessed one extra head entry in
the last loop.
+            tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+            tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
+            tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+            tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+            tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         }
+
+         /* -------------------- epilog 0 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
+         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra entry is
OK.
+         vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t0_s64x2));
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS),    
vget_high_s32(tmp1_QS0_s32x4));
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32 (state_QS);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
+         vst1_s32 (state_QS,     tmp1_QS_s32x2);
+         vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); //
Accessed one extra head entry.
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
+         vst1_lane_s32(state_QS,     tmp1_QS_s32x2, 1);
+         vst1q_s32    (state_QS + 1, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64(corr_QC);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t1_s64x2));
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS);
+         vst1q_s32(state_QS, tmp1_QS1_s32x4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC,     corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4);
// Accessed one extra head entry.
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 4 -------------------- */
+
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t2_s64x2));
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC,     corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
+         vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5),
vget_high_s32(tmp1_QS1_s32x4));
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- epilog 5 & 6 -------------------- */
+
+         vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+         tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
+         tmp2_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4),
tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+         corr_QC3_s64x2 = vld1q_s64(corr_QC);
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS1_s32x4));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)),
32)));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC3_s64x2),
vget_low_s64(t3_s64x2));
+         vst1_s64(corr_QC, corr_QC_s64x1);
+      }
+   }
+
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS
);
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[
order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]           = tmp2_QS[ 0 ];
+      }
+   }
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+   for( i = 0; i <= order - 3; i += 4 ) {
+      int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+      int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+      corr_QC0_s64x2           = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+      corr_QC1_s64x2           = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+      int32x4_t corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2),
vmovn_s64(corr_QC0_s64x2));
+      corr_s32x4               = vrev64q_s32(corr_s32x4);
+      vst1q_s32(corr + order - i - 3, corr_s32x4);
+   }
+   if( lsh >= 0 ) {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64(
corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64(
corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index 375b5eb..2abb5d9 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "debug.h"
 #include "entenc.h"
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "fixed/arm/warped_autocorrelation_FIX_arm.h"
+#endif
+
 #ifndef FORCE_CPP_BUILD
 #ifdef __cplusplus
 extern "C"
@@ -47,6 +52,9 @@ extern "C"
 #define silk_encode_do_VAD_Fxx      silk_encode_do_VAD_FIX
 #define silk_encode_frame_Fxx       silk_encode_frame_FIX
 
+#define QC  10
+#define QS  14
+
 /*********************/
 /* Encoder Functions */
 /*********************/
@@ -121,7 +129,7 @@ void silk_noise_shape_analysis_FIX(
 );
 
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /*
O    Result [order + 1]                                                         
*/
           opus_int                  *scale,                                 /*
O    Scaling of the correlation vector                                          
*/
     const opus_int16                *input,                                 /*
I    Input data to correlate                                                    
*/
@@ -130,6 +138,11 @@ void silk_warped_autocorrelation_FIX(
     const opus_int                  order                                   /*
I    Correlation order (even)                                                   
*/
 );
 
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16,
length, order) \
+    (silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length,
order))
+#endif
+
 /* Calculation of LTP state scaling */
 void silk_LTP_scale_ctrl_FIX(
     silk_encoder_state_FIX          *psEnc,                                 /*
I/O  encoder state                                                              
*/
diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
index e803ef0..6916940 100644
--- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
+++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
@@ -34,12 +34,6 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#undef QC
-#define QC  10
-
-#undef QS
-#define QS  14
-
 /* Autocorrelations for a warped frequency axis */
 #define OVERRIDE_silk_warped_autocorrelation_FIX
 void silk_warped_autocorrelation_FIX(
diff --git a/silk/fixed/warped_autocorrelation_FIX.c
b/silk/fixed/warped_autocorrelation_FIX.c
index 6ca6c11..994c299 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#define QC  10
-#define QS  14
-
 #if defined(MIPSr1_ASM)
 #include "mips/warped_autocorrelation_FIX_mipsr1.h"
 #endif
 
 
-#ifndef OVERRIDE_silk_warped_autocorrelation_FIX
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /*
O    Result [order + 1]                                                         
*/
           opus_int                  *scale,                                 /*
O    Scaling of the correlation vector                                          
*/
     const opus_int16                *input,                                 /*
I    Input data to correlate                                                    
*/
@@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX(
     }
     silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
 }
-#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */
diff --git a/silk/tests/test_unit_optimization_warped_autocorrelation.c
b/silk/tests/test_unit_optimization_warped_autocorrelation.c
new file mode 100644
index 0000000..b7d0ad0
--- /dev/null
+++ b/silk/tests/test_unit_optimization_warped_autocorrelation.c
@@ -0,0 +1,441 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+/* Unrolling the input loop by 8 is about 25% faster than unrolling by 4 on
Chromebook with an ARMv7 Processor. */
+#define NUM_PARALLEL_INPUTS 8
+
+/* Keep this function here because it is the code base to optimize on different
CPUs. */
+void silk_warped_autocorrelation_FIX_c_opt(
+         opus_int32                *corr,                                  /* O
Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O
Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I
Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I
Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I
Length of input                                                             */
+   const opus_int                  order                                   /* I
Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, j, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 2 ] = { 0 }; // Create one
extra entry.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in
the last prolog and the last inner loop.
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n +=
NUM_PARALLEL_INPUTS ) {
+         for( i = 0; i < NUM_PARALLEL_INPUTS; i++ ) {
+            input_QS[i] = tmp1_QS[i] = silk_LSHIFT32( (opus_int32)input[ n + i
], QS );
+         }
+
+         /* Loop over allpass sections */
+
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order ];
+         state_QS[ order ]     = tmp1_QS[ 0 ];
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 1
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 1 ]          = state_QS[ order     ];
+
+         state_QS[ order - 1 ] = tmp1_QS[ 0 ];
+         state_QS[ order     ] = tmp1_QS[ 1 ];
+
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 1
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 2
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 2 ]          = state_QS[ order     ];
+
+         state_QS[ order - 2 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 1 ];
+         state_QS[ order     ] = tmp1_QS[ 2 ];
+
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ],
input_QS[ 2 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 1
] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 2
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 3
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 3 ]          = state_QS[ order     ];
+
+         state_QS[ order - 3 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 2 ];
+         state_QS[ order     ] = tmp1_QS[ 3 ];
+
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ],
input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ],
input_QS[ 3 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 1
] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 2
] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 3
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 4
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 4 ]          = state_QS[ order     ];
+
+         state_QS[ order - 4 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 3 ];
+         state_QS[ order     ] = tmp1_QS[ 4 ];
+
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ],
input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ],
input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ],
input_QS[ 4 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 1
] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 2
] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 3
] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 4
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 5
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 5 ]          = state_QS[ order     ];
+
+         state_QS[ order - 5 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 4 ];
+         state_QS[ order     ] = tmp1_QS[ 5 ];
+
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ],
input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ],
input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ],
input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ],
input_QS[ 5 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 1
] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 2
] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 3
] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 4
] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 5
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 6
] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 6 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 5 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 6 ]          = state_QS[ order     ];
+
+         state_QS[ order - 6 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 5 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 4 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 5 ];
+         state_QS[ order     ] = tmp1_QS[ 6 ];
+
+         corr_QC[ order - 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ],
input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ],
input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ],
input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ],
input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ],
input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ],
input_QS[ 6 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 6 ]          = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ order - 1
] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 2
] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 3
] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 4
] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 5
] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 6
] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 7
] - tmp1_QS[ 0 ], warping_Q16 ); // Accessed one extra head entry when order is
6.
+
+         /* -------------------- kernel loop -------------------- */
+
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+            /* Output of allpass section */
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               tmp2_QS[ j ] = state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j
];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] = tmp1_QS[ j
];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               corr_QC[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] +=
silk_RSHIFT64( silk_SMULL( tmp1_QS[ j ], input_QS[ j ] ), 2 * QS - QC );
+            }
+
+            for( j = NUM_PARALLEL_INPUTS - 1; j >= 0; j-- ) {
+               tmp1_QS[ j ] = silk_SMLAWB( tmp2_QS[ j ], state_QS[ order - i -
NUM_PARALLEL_INPUTS + j ] - tmp1_QS[ j ], warping_Q16 ); // Accessed one extra
head entry in the last loop.
+            }
+         }
+
+         /* -------------------- epilog 0 -------------------- */
+
+         tmp2_QS[ 2 ]  = state_QS[ 1 ];
+         tmp2_QS[ 3 ]  = state_QS[ 2 ];
+         tmp2_QS[ 4 ]  = state_QS[ 3 ];
+         tmp2_QS[ 5 ]  = state_QS[ 4 ];
+         tmp2_QS[ 6 ]  = state_QS[ 5 ];
+         tmp2_QS[ 7 ]  = state_QS[ 6 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 1 ];
+         state_QS[ 1 ] = tmp1_QS[ 2 ];
+         state_QS[ 2 ] = tmp1_QS[ 3 ];
+         state_QS[ 3 ] = tmp1_QS[ 4 ];
+         state_QS[ 4 ] = tmp1_QS[ 5 ];
+         state_QS[ 5 ] = tmp1_QS[ 6 ];
+         state_QS[ 6 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ]
), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ]
), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ]
), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ]
), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ]
), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ]
), 2 * QS - QC );
+         corr_QC[ 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ]
), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 5 ] - tmp1_QS[ 7
], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 4 ] - tmp1_QS[ 6
], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 3 ] - tmp1_QS[ 5
], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 2 ] - tmp1_QS[ 4
], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 1 ] - tmp1_QS[ 3
], warping_Q16 );
+         tmp1_QS[ 2 ]  = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ 0 ] - tmp1_QS[ 2
], warping_Q16 );
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS[ 3 ]  = state_QS[ 1 ];
+         tmp2_QS[ 4 ]  = state_QS[ 2 ];
+         tmp2_QS[ 5 ]  = state_QS[ 3 ];
+         tmp2_QS[ 6 ]  = state_QS[ 4 ];
+         tmp2_QS[ 7 ]  = state_QS[ 5 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 2 ];
+         state_QS[ 1 ] = tmp1_QS[ 3 ];
+         state_QS[ 2 ] = tmp1_QS[ 4 ];
+         state_QS[ 3 ] = tmp1_QS[ 5 ];
+         state_QS[ 4 ] = tmp1_QS[ 6 ];
+         state_QS[ 5 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ]
), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ]
), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ]
), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ]
), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ]
), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ]
), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 4 ] - tmp1_QS[ 7
], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 3 ] - tmp1_QS[ 6
], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 2 ] - tmp1_QS[ 5
], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 1 ] - tmp1_QS[ 4
], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 0 ] - tmp1_QS[ 3
], warping_Q16 );
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS[ 4 ]  = state_QS[ 1 ];
+         tmp2_QS[ 5 ]  = state_QS[ 2 ];
+         tmp2_QS[ 6 ]  = state_QS[ 3 ];
+         tmp2_QS[ 7 ]  = state_QS[ 4 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 3 ];
+         state_QS[ 1 ] = tmp1_QS[ 4 ];
+         state_QS[ 2 ] = tmp1_QS[ 5 ];
+         state_QS[ 3 ] = tmp1_QS[ 6 ];
+         state_QS[ 4 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ]
), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ]
), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ]
), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ]
), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ]
), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 3 ] - tmp1_QS[ 7
], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 2 ] - tmp1_QS[ 6
], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 1 ] - tmp1_QS[ 5
], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 0 ] - tmp1_QS[ 4
], warping_Q16 );
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS[ 5 ]  = state_QS[ 1 ];
+         tmp2_QS[ 6 ]  = state_QS[ 2 ];
+         tmp2_QS[ 7 ]  = state_QS[ 3 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 4 ];
+         state_QS[ 1 ] = tmp1_QS[ 5 ];
+         state_QS[ 2 ] = tmp1_QS[ 6 ];
+         state_QS[ 3 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ]
), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ]
), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ]
), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ]
), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 2 ] - tmp1_QS[ 7
], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 1 ] - tmp1_QS[ 6
], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 0 ] - tmp1_QS[ 5
], warping_Q16 );
+
+         /* -------------------- epilog 4 -------------------- */
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ]
), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ]
), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ]
), 2 * QS - QC );
+
+         opus_int32 tmp1_QS_2 = silk_SMLAWB( state_QS[ 1 ], tmp1_QS[ 5 ] -
tmp1_QS[ 6 ], warping_Q16 );
+         state_QS[ 1 ]        = silk_SMLAWB( state_QS[ 2 ], tmp1_QS[ 6 ] -
tmp1_QS[ 7 ], warping_Q16 );
+
+         /* -------------------- epilog 5 & 6 -------------------- */
+
+         state_QS[ 0 ] = silk_SMLAWB( tmp1_QS[ 6 ], tmp1_QS_2 - state_QS[ 1 ],
warping_Q16 );
+         state_QS[ 2 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS_2,     input_QS[ 6
] ), 2 * QS - QC )
+               +         silk_RSHIFT64( silk_SMULL( state_QS[ 0 ], input_QS[ 7
] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( state_QS[ 1 ], input_QS[ 7
] ), 2 * QS - QC );
+      }
+   }
+
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS
);
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[
order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ],
input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = tmp2_QS[ 0 ];
+      }
+   }
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   if( lsh >= 0 ) {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64(
corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64(
corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#define MAX_LENGTH 360
+
+static int test_warped_autocorrelation(int arch)
+{
+   unsigned int i;
+   opus_int32 corrOrg[MAX_SHAPE_LPC_ORDER + 1], corrOpt[MAX_SHAPE_LPC_ORDER +
1];
+   opus_int   scaleOrg, scaleOpt;
+   opus_int16 input[MAX_LENGTH];
+   opus_int   warping_Q16, length, order;
+   (void)arch;
+
+   printf("%50s", "silk_warped_autocorrelation_FIX() ...");
+   for( order = 0; order <= MAX_SHAPE_LPC_ORDER; order += 2 ) // order must
be even.
+   {
+      for( length = 0; length <= MAX_LENGTH; length++ )
+      {
+         for (i=0;i<MAX_LENGTH;++i)
+         {
+            input[i] = (rand() % 32767) - 16384;
+         }
+         warping_Q16 = rand() % 32767;
+         memcpy(corrOpt, corrOrg, sizeof(corrOrg));
+
+         silk_warped_autocorrelation_FIX_c(corrOrg, &scaleOrg, input,
warping_Q16, length, order);
+         silk_warped_autocorrelation_FIX  (corrOpt, &scaleOpt, input,
warping_Q16, length, order);
+         if (memcmp(corrOpt, corrOrg, sizeof(corrOrg)))
+         {
+            printf("order=%2d length=%3d failed!\n", order, length);
+            for (i=0;i<sizeof(corrOrg) / sizeof(*corrOrg);i++)
+            {
+               if (corrOrg[i] != corrOpt[i])
+               {
+                  printf("\ncorrOrg[%3d]=%12d, corrOpt[%3d]=%12d", i,
corrOrg[i], i, corrOpt[i]);
+               }
+            }
+            printf("\n");
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index f8bf1d2..52c42d0 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -30,6 +30,7 @@ silk/arm/SigProc_FIX_armv5e.h \
 silk/arm/NSQ_neon.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
+silk/fixed/arm/warped_autocorrelation_FIX_arm.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
 silk/fixed/mips/prefilter_FIX_mipsr1.h \
 silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7229ee3..5f9551b 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -117,6 +117,9 @@ SILK_SOURCES_FIXED_SSE4_1 =
silk/fixed/x86/vector_ops_FIX_sse.c \
 silk/fixed/x86/burg_modified_FIX_sse.c \
 silk/fixed/x86/prefilter_FIX_sse.c
 
+SILK_SOURCES_FIXED_ARM_NEON_INTR = \
+silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+
 SILK_SOURCES_FLOAT = \
 silk/float/apply_sine_window_FLP.c \
 silk/float/corrMatrix_FLP.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 7eeab38..b5c25d9 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -40,6 +40,7 @@
 #ifdef FIXED_POINT
 
 # include "celt/tests/test_unit_optimization_lpc.c"
+# include
"silk/tests/test_unit_optimization_warped_autocorrelation.c"
 
 #endif
 
@@ -56,6 +57,7 @@ int main(void)
       printf("\n--------------------------- Testing optimization
---------------------------\n");
 #ifdef FIXED_POINT
       result |= test_fir(arch);
+      result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
    }
    return result;
-- 
2.8.0.rc3.226.g39d4020

Linfeng Zhang

2016-Jul-14 00:49 UTC

head link

[opus] [PATCH 4/5] Refactor silk_warped_autocorrelation_FIX_neon()

Clean the code by defining macros.
---
 .../arm/warped_autocorrelation_FIX_neon_intr.c     | 637 ++++++++++-----------
 1 file changed, 287 insertions(+), 350 deletions(-)

diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index 80dd949..6071445 100644
--- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -40,7 +40,6 @@
 #endif
 
 #include <stdio.h>
-
 #include <arm_neon.h>
 #include "stack_alloc.h"
 #include "main_FIX.h"
@@ -49,6 +48,190 @@
 
 #define NUM_PARALLEL_INPUTS 8
 
+#define vget_all(x) (x)
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_1_OR_2(                                            
\
+   corr_QC,           /* I/O  corr_QC buffer. Updated 1 or 2 elements.    */  
\
+   state_QS,          /* I    state_QS buffer.                            */  
\
+   offset,            /* I    The address offset of corr_QC and state_QS. */  
\
+   input_QS0_s32x4,   /* I    Input_QS elements 0 to 3.                   */  
\
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */  
\
+   tmp1_QS_s32x2,     /* I/O  Either 1 or 2 elements of tmp1_QS.          */  
\
+   tmp2_QS_s32x2,     /* I    Either 1 or 2 elements of tmp2_QS.          */  
\
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */  
\
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */  
\
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */  
\
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */  
\
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */  
\
+)                                                                             
\
+{                                                                             
\
+   int64xX_t corr_QC_s64xX;                                                   
\
+   int64x2_t t_s64x2;                                                         
\
+   corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset));                           
\
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));   
\
+   t_s64x2       = vshrq_n_s64(t_s64x2, 2 * QS - QC);                         
\
+   corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2));               
\
+   (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX);                            
\
+   tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2);
\
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);               
\
+   tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16);                                  
\
+   tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);                    
\
+}
+
+/* Calculate 3 or 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_3_OR_4(                                             
\
+   corr_QC,           /* I/O  corr_QC buffer. Updated 3 or 4 elements.    */   
\
+   state_QS,          /* I/O  state_QS buffer. Updated 4 elements.        */   
\
+   offset,            /* I    The address offset of corr_QC and state_QS. */   
\
+   input_QS0_s32x4,   /* I    Input_QS elements 0 to 3.                   */   
\
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */   
\
+   tmp1_QS0_s32x4,    /* O    Updated 3 or 4 elements of tmp1_QS.         */   
\
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */   
\
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */   
\
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */   
\
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */   
\
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */   
\
+)                                                                              
\
+{                                                                              
\
+   int32x4_t tmp2_QS_s32x4;                                                    
\
+   int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2;                               
\
+   int64xX_t corr_QC_s64xX;                                                    
\
+   tmp2_QS_s32x4  = vld1q_s32(state_QS + (offset));                            
\
+   vst1q_s32(state_QS + (offset), tmp1_QS0_s32x4);                             
\
+   corr_QC0_s64x2 = vld1q_s64  (corr_QC + (offset));                           
\
+   corr_QC_s64xX  = (vld1X_s64)(corr_QC + (offset) + 2);                       
\
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                        
\
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                        
\
+   corr_QC0_s64x2 = vaddq_s64  (corr_QC0_s64x2, t0_s64x2);                     
\
+   corr_QC_s64xX  = (vaddX_s64)(corr_QC_s64xX,  vget_X(t1_s64x2));             
\
+   vst1q_s64  (corr_QC + (offset),     corr_QC0_s64x2);                        
\
+   (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX);                         
\
+   tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1),
tmp1_QS0_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32x2);              \
+   tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS_s32x4);                  
\
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog and kernel
loop. */
+#define CORRELATION_4(offset) CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS,
offset, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x2_t, vget_all,
vld1q_s64, vst1q_s64, vaddq_s64)
+
+/* Calculate 3 or 4 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_NEXT_3_OR_4(                                               
\
+   corr_QC,           /* I/O  corr_QC buffer. Updated 3 or 4 elements.    */   
\
+   state_QS,          /* I    state_QS buffer.                            */   
\
+   offset,            /* I    The address offset of corr_QC and state_QS. */   
\
+   input_QS1_s32x4,   /* I    4 elements of input_QS.                     */   
\
+   tmp1_QS1_s32x4,    /* I/O  Either 3 or 4 elements of tmp1_QS.          */   
\
+   tmp2_QS1_s32x4,    /* I    Either 3 or 4 elements of tmp2_QS.          */   
\
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */   
\
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */   
\
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */   
\
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */   
\
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */   
\
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */   
\
+)                                                                              
\
+{                                                                              
\
+   int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2;                               
\
+   int64xX_t corr_QC_s64xX;                                                    
\
+   corr_QC0_s64x2 = vld1q_s64  (corr_QC + (offset));                           
\
+   corr_QC_s64xX  = (vld1X_s64)(corr_QC + (offset) + 2);                       
\
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                        
\
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                        
\
+   corr_QC0_s64x2 = vaddq_s64  (corr_QC0_s64x2, t0_s64x2);                     
\
+   corr_QC_s64xX  = (vaddX_s64)(corr_QC_s64xX,  vget_X(t1_s64x2));             
\
+   vst1q_s64  (corr_QC + (offset),     corr_QC0_s64x2);                        
\
+   (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX);                         
\
+   tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1),
tmp1_QS1_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32x2);              \
+   tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);                 
\
+}
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_EXTRA_1_OR_2(                                              
\
+   corr_QC,           /* I/O  corr_QC buffer. Updated 1 or 2 elements.         
*/ \
+   state_QS,          /* I    state_QS buffer.                                 
*/ \
+   offset,            /* I    The address offset of corr_QC and state_QS.      
*/ \
+   input_QS_s32x2,    /* I    2 elements of input_QS.                          
*/ \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.         
*/ \
+   tmp1_QS_s32x2X,    /* I    Either tmp1_QS_s32x2 or high half of
tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \
+   tmp2_QS_s32x2,     /* I    Either 1 or 2 elements of tmp2_QS.               
*/ \
+   tmp1_QS_s32x2,     /* O    Updated 1 or 2 elements of tmp1_QS.              
*/ \
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.                   
*/ \
+   vget_X,            /*      Either vget_low_s64 or vget_all.                 
*/ \
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.                    
*/ \
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.                    
*/ \
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.                    
*/ \
+)                                                                              
\
+{                                                                              
\
+   int64xX_t corr_QC_s64xX;                                                    
\
+   int64x2_t t_s64x2;                                                          
\
+   corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset));                            
\
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2X, input_QS_s32x2);                  
\
+   t_s64x2       = vshrq_n_s64(t_s64x2, 2 * QS - QC);                          
\
+   corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2));                
\
+   (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX);                             
\
+   tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2X);
\
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);                
\
+   tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16);                                   
\
+   tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);                     
\
+}
+
+/* Calculate 1 element of corr_QC. */
+#define CORRELATION_EPILOG_1(                                                  
\
+   corr_QC,         /* I/O  corr_QC buffer. Updated 1 element.                 
*/ \
+   input_QS0_s32x4, /* I    4 elements of input_QS.                            
*/ \
+   tmp1_QS_s32xX,   /* I    Either tmp1_QS_s32x2 or low half of tmp1_QS0_s32x4,
with 1 or 2 elements of tmp1_QS. */ \
+   vget_X           /*      The splitting instruction, either vget_low_s32 or
vget_high_s32.                     */ \
+)                                                                              
\
+{                                                                              
\
+   int64x1_t corr_s64x1;                                                       
\
+   int64x2_t t_s64x2;                                                          
\
+   corr_s64x1 = vld1_s64(corr_QC);                                             
\
+   t_s64x2    = vmull_s32(tmp1_QS_s32xX, (vget_X)(input_QS0_s32x4));           
\
+   t_s64x2    = vshrq_n_s64(t_s64x2, 2 * QS - QC);                             
\
+   corr_s64x1 = vadd_s64(corr_s64x1, vget_high_s64(t_s64x2));                  
\
+   vst1_s64(corr_QC, corr_s64x1);                                              
\
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */
+#define CORRELATION_EPILOG_4(                                                  
\
+   corr_QC,           /* I/O  corr_QC buffer. Updated 4 elements.              
*/ \
+   state_QS,          /* I/O  state_QS buffer. Updated 4 elements.             
*/ \
+   offset,            /* I    The address offset of corr_QC and state_QS.      
*/ \
+   input_QS1_s32x4,   /* I    Input_QS elements 4 to 7.                        
*/ \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.         
*/ \
+   tmp1_QS1_s32x4     /* I/O  4 elements of tmp1_QS.                           
*/ \
+ )                                                                             
\
+ {                                                                             
\
+   int32x4_t tmp2_QS_s32x4;                                                    
\
+   int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, t0_s64x2, t1_s64x2;               
\
+   tmp2_QS_s32x4  = vld1q_s32(state_QS + (offset));                            
\
+   vst1q_s32(state_QS + (offset), tmp1_QS1_s32x4);                             
\
+   corr_QC0_s64x2 = vld1q_s64(corr_QC + (offset));                             
\
+   corr_QC1_s64x2 = vld1q_s64(corr_QC + (offset) + 2);                         
\
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                        
\
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                        
\
+   corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);                       
\
+   corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);                       
\
+   vst1q_s64(corr_QC + (offset),     corr_QC0_s64x2);                          
\
+   vst1q_s64(corr_QC + (offset) + 2, corr_QC1_s64x2);                          
\
+   tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1),
tmp1_QS1_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32x2);              \
+   tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS_s32x4);                  
\
+}
+
 void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O
Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O
Scaling of the correlation vector                                           */
@@ -61,9 +244,10 @@ void silk_warped_autocorrelation_FIX_neon(
    opus_int   n = 0, i, lsh;
    opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
    opus_int32 input_QS[NUM_PARALLEL_INPUTS];
-   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two
extra entries.
-   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in
the last prolog and the last inner loop, and one extra end entry in the last
prolog.
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two
extra elements.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head element
in the last prolog and the last inner loop, and one extra end element in the
last prolog.
    opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+   int64x2_t  lsh_s64x2;
 
    /* Order must be even */
    silk_assert( ( order & 1 ) == 0 );
@@ -71,387 +255,138 @@ void silk_warped_autocorrelation_FIX_neon(
 
    /* Loop over samples */
    if( order >= NUM_PARALLEL_INPUTS - 2 ) {
-      const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+      const int32x2_t warping_Q16_s32x2 = vdup_n_s32(warping_Q16);
       for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n +=
NUM_PARALLEL_INPUTS ) {
-         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4,
tmp2_QS1_s32x4;
-         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2,
corr_QC3_s64x2;
-         int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4;
          int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
-         int64x1_t corr_QC_s64x1;
          const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n),    
QS);
          const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4),
QS);
-         vst1q_s32(tmp1_QS,      input_QS0_s32x4);
-         vst1q_s32(tmp1_QS  + 4, input_QS1_s32x4);
+         vst1q_s32(tmp1_QS,     input_QS0_s32x4);
+         vst1q_s32(tmp1_QS + 4, input_QS1_s32x4);
 
          /* Loop over allpass sections */
 
          /* -------------------- prolog 0 -------------------- */
-
-         tmp1_QS_s32x2  = vget_low_s32(input_QS0_s32x4);
-         tmp2_QS_s32x2  = vld1_s32(state_QS + order); // Accessed one extra end
entry.
+         tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end
element.
          vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
-         corr_QC_s64x1  = vld1_s64(corr_QC + order);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_low_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
-         vst1_s64(corr_QC + order, corr_QC_s64x1);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 1),
tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
-         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+         CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 0,
input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x1_t,
vget_low_s64, vld1_s64,  vst1_s64,  vadd_s64)
+         tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
 
          /* -------------------- prolog 1 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + order - 1);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
          vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_low_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 2),
tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 1,
input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x2_t,
vget_all,     vld1q_s64, vst1q_s64, vaddq_s64)
          tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS0_s32x4));
 
          /* -------------------- prolog 2 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one
extra end entry.
-         vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4);  // Saving one extra
entry is OK.
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t1_s64x2));
-         vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3),
tmp1_QS0_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         // Accessed one extra end element of state_QS.
+         // Saving one extra element of state_QS is OK.
+         CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, order - 2,
input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x1_t, vget_low_s64,
vld1_s64, vst1_s64, vadd_s64)
          tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
 
          /* -------------------- prolog 3 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
-         vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4),
tmp1_QS0_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vget_low_s32(input_QS1_s32x4);
+         CORRELATION_4(order - 3)
+         tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4);
 
          /* -------------------- prolog 4 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
-         tmp2_QS_s32x2  = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
-         vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+         tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
          vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32
(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t2_s64x2));
-         vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5),
tmp1_QS0_s32x4);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 1),
tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+         CORRELATION_4(order - 4)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order,
vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2,
tmp1_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
+         tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
 
          /* -------------------- prolog 5 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
-         tmp2_QS_s32x2  = vld1_s32 (state_QS + order - 1);
-         vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
-         vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32
(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6),
tmp1_QS0_s32x4);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 2),
tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
+         vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+         CORRELATION_4(order - 5)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order - 1,
vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2,
tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
          tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS1_s32x4));
 
          /* -------------------- prolog 6 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one
extra end entry.
-         vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
-         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra
entry is OK.
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t3_s64x2));
-         vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7),
tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3),
tmp1_QS1_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one
extra end element of state_QS.
+         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra
element of state_QS is OK.
+         // Accessed one extra head element when order is 6.
+         CORRELATION_4(order - 6)
+         CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - 2, input_QS1_s32x4,
tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x1_t, vget_low_s64,
vld1_s64, vst1_s64, vadd_s64)
          tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
 
          /* -------------------- kernel loop -------------------- */
-
          for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
-             /* Output of allpass section */
-            tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 1);
+            /* Output of allpass section */
+            // Accessed one extra head element of state_QS in the last loop.
             tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 5);
-            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1,
tmp1_QS0_s32x4);
             vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5,
tmp1_QS1_s32x4);
-            corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 1);
-            corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 3);
-            corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 5);
-            corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i -
NUM_PARALLEL_INPUTS + 7);
-            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
vget_low_s32 (input_QS0_s32x4));
-            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
vget_low_s32 (input_QS1_s32x4));
-            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-            t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-            t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-            corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-            corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-            corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1,
corr_QC0_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3,
corr_QC1_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5,
corr_QC2_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7,
corr_QC3_s64x2);
-            tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS),     tmp1_QS0_s32x4); // Accessed one extra head entry in
the last loop.
-            tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i -
NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
-            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4),
warping_Q16_s32);
-            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
warping_Q16_s32);
-            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-            tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16),
vshrn_n_s64(t1_s64x2, 16));
-            tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-            tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-            tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+            CORRELATION_4(order - i - NUM_PARALLEL_INPUTS + 1)
+            CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - i -
NUM_PARALLEL_INPUTS + 5, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4,
warping_Q16_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
          }
 
          /* -------------------- epilog 0 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
-         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra entry is
OK.
-         vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
-         corr_QC_s64x1  = vld1_s64 (corr_QC);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32
(input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4),
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t0_s64x2));
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS),    
vget_high_s32(tmp1_QS0_s32x4));
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra element
is OK.
+         CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4,
vget_low_s32(tmp1_QS0_s32x4), vget_low_s32)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 1,
vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2,
vget_high_s32(tmp1_QS0_s32x4), tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t,
vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 3, input_QS1_s32x4,
warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 1 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32 (state_QS);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
-         vst1_s32 (state_QS,     tmp1_QS_s32x2);
-         vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); //
Accessed one extra head entry.
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS);
+         vst1_s32(state_QS, tmp1_QS_s32x2);
+         // Accessed one extra head element of state_QS.
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 0,
vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2,
tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 2, input_QS1_s32x4,
warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 2 -------------------- */
-
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
-         vst1_lane_s32(state_QS,     tmp1_QS_s32x2, 1);
-         vst1q_s32    (state_QS + 1, tmp1_QS1_s32x4);
-         corr_QC_s64x1  = vld1_s64(corr_QC);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                
vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t1_s64x2));
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1);
+         CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, tmp1_QS_s32x2,
vget_high_s32)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 1, input_QS1_s32x4,
warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 3 -------------------- */
-
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS);
-         vst1q_s32(state_QS, tmp1_QS1_s32x4);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1q_s64(corr_QC,     corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4);
// Accessed one extra head entry.
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
warping_Q16_s32);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16),
vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
-
-         /* -------------------- epilog 4 -------------------- */
-
-         corr_QC_s64x1  = vld1_s64 (corr_QC);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32
(input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t2_s64x2));
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC,     corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
-         vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5),
vget_high_s32(tmp1_QS1_s32x4));
-         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
-         vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
-
-         /* -------------------- epilog 5 & 6 -------------------- */
-
-         vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
-         tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
-         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
-         tmp2_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
-         tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4),
tmp2_QS_s32x2);
-         vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
-
-         corr_QC3_s64x2 = vld1q_s64(corr_QC);
-         t3_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS1_s32x4));
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
-         t3_s64x2       = vmull_s32(tmp2_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)),
32)));
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC3_s64x2),
vget_low_s64(t3_s64x2));
-         vst1_s64(corr_QC, corr_QC_s64x1);
+         // Accessed one extra head element of state_QS.
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 0, input_QS1_s32x4,
warping_Q16_s32x2, tmp1_QS1_s32x4)
+
+         {
+            int64x1_t corr_QC_s64x1;
+            int64x2_t corr_QC0_s64x2;
+            int64x2_t t0_s64x2, t1_s64x2;
+
+            /* -------------------- epilog 4 -------------------- */
+            corr_QC_s64x1  = vld1_s64 (corr_QC);
+            corr_QC0_s64x2 = vld1q_s64(corr_QC + 1);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4),
vget_low_s32 (input_QS1_s32x4));
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4),
vget_high_s32(input_QS1_s32x4));
+            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2));
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+            vst1_s64 (corr_QC,     corr_QC_s64x1);
+            vst1q_s64(corr_QC + 1, corr_QC0_s64x2);
+            vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+            tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+            tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5),
vget_high_s32(tmp1_QS1_s32x4));
+            t1_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);
+            tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+            tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+            vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+            /* -------------------- epilog 5 & 6 -------------------- */
+            vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+            tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+            t1_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32x2);
+            tmp2_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+            tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4),
tmp2_QS_s32x2);
+            vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+            corr_QC0_s64x2 = vld1q_s64(corr_QC);
+            t1_s64x2       = vmull_s32(tmp1_QS_s32x2,
vget_high_s32(input_QS1_s32x4));
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+            vst1_s64(corr_QC + 1, vget_high_s64(corr_QC0_s64x2));
+            t1_s64x2       = vmull_s32(tmp2_QS_s32x2,
vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)),
32)));
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC0_s64x2),
vget_low_s64(t1_s64x2));
+            vst1_s64(corr_QC, corr_QC_s64x1);
+         }
       }
    }
 
@@ -470,14 +405,16 @@ void silk_warped_autocorrelation_FIX_neon(
    lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
    *scale = -( QC + lsh );
    silk_assert( *scale >= -30 && *scale <= 12 );
-   const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+   lsh_s64x2 = vdupq_n_s64(lsh);
    for( i = 0; i <= order - 3; i += 4 ) {
-      int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
-      int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
-      corr_QC0_s64x2           = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
-      corr_QC1_s64x2           = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
-      int32x4_t corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2),
vmovn_s64(corr_QC0_s64x2));
-      corr_s32x4               = vrev64q_s32(corr_s32x4);
+      int32x4_t corr_s32x4;
+      int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
+      corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+      corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+      corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+      corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+      corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2),
vmovn_s64(corr_QC0_s64x2));
+      corr_s32x4     = vrev64q_s32(corr_s32x4);
       vst1q_s32(corr + order - i - 3, corr_s32x4);
    }
    if( lsh >= 0 ) {
-- 
2.8.0.rc3.226.g39d4020

Linfeng Zhang

2016-Jul-14 00:49 UTC

head link

[opus] [PATCH 5/5] Optimize silk/LPC_inv_pred_gain.c for ARM NEON

Optimized LPC_inverse_pred_gain_QA(), silk_LPC_inverse_pred_gain() and
silk_LPC_inverse_pred_gain_Q24() for ARM NEON.
Created corresponding unit test.
---
 silk/CNG.c                                         |   2 +-
 silk/LPC_inv_pred_gain.c                           |  18 +-
 silk/NLSF2A.c                                      |   3 +-
 silk/SigProc_FIX.h                                 |  19 +-
 silk/arm/LPC_inv_pred_gain_arm.h                   |  84 +++++++
 silk/arm/LPC_inv_pred_gain_neon_intr.c             | 258 +++++++++++++++++++++
 silk/arm/arm_silk_map.c                            |  24 +-
 silk/decode_parameters.c                           |   4 +-
 silk/fixed/find_LPC_FIX.c                          |   2 +-
 silk/float/find_LPC_FLP.c                          |   2 +-
 silk/float/main_FLP.h                              |   3 +-
 silk/float/wrappers_FLP.c                          |   5 +-
 silk/init_decoder.c                                |   1 +
 silk/process_NLSFs.c                               |   4 +-
 silk/structs.h                                     |   1 +
 .../test_unit_optimization_LPC_inv_pred_gain.c     | 107 +++++++++
 silk_headers.mk                                    |   1 +
 silk_sources.mk                                    |   1 +
 tests/test_unit_optimization.c                     |   9 +-
 19 files changed, 523 insertions(+), 25 deletions(-)
 create mode 100644 silk/arm/LPC_inv_pred_gain_arm.h
 create mode 100644 silk/arm/LPC_inv_pred_gain_neon_intr.c
 create mode 100644 silk/tests/test_unit_optimization_LPC_inv_pred_gain.c

diff --git a/silk/CNG.c b/silk/CNG.c
index 8443ad6..78d500a 100644
--- a/silk/CNG.c
+++ b/silk/CNG.c
@@ -142,7 +142,7 @@ void silk_CNG(
         silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14,
length, &psCNG->rand_seed );
 
         /* Convert CNG NLSF to filter representation */
-        silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
+        silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order,
psDec->arch );
 
         /* Generate CNG signal, by synthesis filtering */
         silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER *
sizeof( opus_int32 ) );
diff --git a/silk/LPC_inv_pred_gain.c b/silk/LPC_inv_pred_gain.c
index 4af89aa..64747ad 100644
--- a/silk/LPC_inv_pred_gain.c
+++ b/silk/LPC_inv_pred_gain.c
@@ -36,9 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #define MUL32_FRAC_Q(a32, b32, Q)  
((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))
 
-/* Compute inverse of LPC prediction gain, and                          */
-/* test if LPC coefficients are stable (all poles within unit circle)   */
-static opus_int32 LPC_inverse_pred_gain_QA(                 /* O   Returns
inverse prediction gain in energy domain, Q30    */
+/* Compute inverse of LPC prediction gain, and                                 
*/
+/* test if LPC coefficients are stable (all poles within unit circle)          
*/
+/* Note that specific platforms' optimizations don't guarantee
identical A_QA buffer.           */
+/* Since the intermediate A_QA buffer is never used again in the caller
functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_c(               /* O   Returns
inverse prediction gain in energy domain, Q30    */
     opus_int32           A_QA[ 2 ][ SILK_MAX_ORDER_LPC ],   /* I   Prediction
coefficients                                  */
     const opus_int       order                              /* I   Prediction
order                                         */
 )
@@ -106,7 +108,7 @@ static opus_int32 LPC_inverse_pred_gain_QA(                
/* O   Returns inver
 }
 
 /* For input in Q12 domain */
-opus_int32 silk_LPC_inverse_pred_gain(              /* O   Returns inverse
prediction gain in energy domain, Q30        */
+opus_int32 silk_LPC_inverse_pred_gain_c(            /* O   Returns inverse
prediction gain in energy domain, Q30        */
     const opus_int16            *A_Q12,             /* I   Prediction
coefficients, Q12 [order]                         */
     const opus_int              order               /* I   Prediction order    
*/
 )
@@ -127,13 +129,14 @@ opus_int32 silk_LPC_inverse_pred_gain(              /* O  
Returns inverse predi
     if( DC_resp >= 4096 ) {
         return 0;
     }
-    return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+    return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+    /* Don't use Atmp_QA buffer anymore from here, because specific
platforms' optimizations don't guarantee identical values. */
 }
 
 #ifdef FIXED_POINT
 
 /* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24(          /* O    Returns inverse
prediction gain in energy domain, Q30       */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c(        /* O    Returns inverse
prediction gain in energy domain, Q30       */
     const opus_int32            *A_Q24,             /* I    Prediction
coefficients [order]                             */
     const opus_int              order               /* I    Prediction order   
*/
 )
@@ -149,6 +152,7 @@ opus_int32 silk_LPC_inverse_pred_gain_Q24(          /* O   
Returns inverse pred
         Anew_QA[ k ] = silk_RSHIFT32( A_Q24[ k ], 24 - QA );
     }
 
-    return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+    return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+    /* Don't use Atmp_QA buffer anymore from here, because specific
platforms' optimizations don't guarantee identical values. */
 }
 #endif
diff --git a/silk/NLSF2A.c b/silk/NLSF2A.c
index b1c559e..a259212 100644
--- a/silk/NLSF2A.c
+++ b/silk/NLSF2A.c
@@ -66,7 +66,8 @@ static OPUS_INLINE void silk_NLSF2A_find_poly(
 void silk_NLSF2A(
     opus_int16                  *a_Q12,             /* O    monic whitening
filter coefficients in Q12,  [ d ]          */
     const opus_int16            *NLSF,              /* I    normalized line
spectral frequencies in Q15, [ d ]          */
-    const opus_int              d                   /* I    filter order
(should be even)                               */
+    const opus_int              d,                  /* I    filter order
(should be even)                               */
+    int                         arch                /* I    Run-time
architecture                                       */
 )
 {
     /* This ordering was found to maximize quality. It improves numerical
accuracy of
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..570ae11 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -47,6 +47,10 @@ extern "C"
 #include "x86/SigProc_FIX_sse.h"
 #endif
 
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_inv_pred_gain_arm.h"
+#endif
+
 /********************************************************************/
 /*                    SIGNAL PROCESSING FUNCTIONS                   */
 /********************************************************************/
@@ -132,13 +136,13 @@ void silk_bwexpander_32(
 
 /* Compute inverse of LPC prediction gain, and                           */
 /* test if LPC coefficients are stable (all poles within unit circle)    */
-opus_int32 silk_LPC_inverse_pred_gain(              /* O   Returns inverse
prediction gain in energy domain, Q30        */
+opus_int32 silk_LPC_inverse_pred_gain_c(            /* O   Returns inverse
prediction gain in energy domain, Q30        */
     const opus_int16            *A_Q12,             /* I   Prediction
coefficients, Q12 [order]                         */
     const opus_int              order               /* I   Prediction order    
*/
 );
 
 /* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24(          /* O    Returns inverse
prediction gain in energy domain, Q30       */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c(        /* O    Returns inverse
prediction gain in energy domain, Q30       */
     const opus_int32            *A_Q24,             /* I    Prediction
coefficients [order]                             */
     const opus_int              order               /* I    Prediction order   
*/
 );
@@ -152,6 +156,14 @@ void silk_ana_filt_bank_1(
     const opus_int32            N                   /* I    Number of input
samples                                     */
 );
 
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+#define silk_LPC_inverse_pred_gain(A_Q12, order)    
((void)(arch),silk_LPC_inverse_pred_gain_c(A_Q12, order))
+#endif
+
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+#define silk_LPC_inverse_pred_gain_Q24(A_Q24, order)
((void)(arch),silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order))
+#endif
+
 /********************************************************************/
 /*                        SCALAR FUNCTIONS                          */
 /********************************************************************/
@@ -271,7 +283,8 @@ void silk_A2NLSF(
 void silk_NLSF2A(
     opus_int16                  *a_Q12,             /* O    monic whitening
filter coefficients in Q12,  [ d ]          */
     const opus_int16            *NLSF,              /* I    normalized line
spectral frequencies in Q15, [ d ]          */
-    const opus_int              d                   /* I    filter order
(should be even)                               */
+    const opus_int              d,                  /* I    filter order
(should be even)                               */
+    int                         arch                /* I    Run-time
architecture                                       */
 );
 
 void silk_insertion_sort_increasing(
diff --git a/silk/arm/LPC_inv_pred_gain_arm.h b/silk/arm/LPC_inv_pred_gain_arm.h
new file mode 100644
index 0000000..77d7167
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_arm.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(LPC_INV_PRED_GAIN_ARM_H)
+# define LPC_INV_PRED_GAIN_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse
prediction gain in energy domain, Q30        */
+    const opus_int16            *A_Q12,             /* I   Prediction
coefficients, Q12 [order]                         */
+    const opus_int              order               /* I   Prediction order    
*/
+);
+# endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_silk_LPC_inverse_pred_gain      (1)
+#   define silk_LPC_inverse_pred_gain(A_Q12, order)
((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))
+#  endif
+
+#  if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+/*Is run-time CPU detection enabled on this platform?*/
+#   if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
&& !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const
SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const
opus_int order);
+#    define OVERRIDE_silk_LPC_inverse_pred_gain      (1)
+#    define silk_LPC_inverse_pred_gain(A_Q12, order)
((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))
+#   elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#    define OVERRIDE_silk_LPC_inverse_pred_gain      (1)
+#    define silk_LPC_inverse_pred_gain(A_Q12, order)
((void)(arch),silk_LPC_inverse_pred_gain_neon(A_Q12, order))
+#   endif
+#  endif
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon(     /* O    Returns inverse
prediction gain in energy domain, Q30       */
+    const opus_int32            *A_Q24,             /* I    Prediction
coefficients [order]                             */
+    const opus_int              order               /* I    Prediction order   
*/
+);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_silk_LPC_inverse_pred_gain_Q24      (1)
+#   define silk_LPC_inverse_pred_gain_Q24(A_Q24, order)
((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain_Q24)(A_Q24, order))
+#  endif
+
+#  if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+/*Is run-time CPU detection enabled on this platform?*/
+#    if defined(OPUS_HAVE_RTCD) &&
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const
SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK+1])(const opus_int32 *A_Q24,
const opus_int order);
+#     define OVERRIDE_silk_LPC_inverse_pred_gain_Q24      (1)
+#     define silk_LPC_inverse_pred_gain_Q24(A_Q24, order)
((*SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[(arch)&OPUS_ARCHMASK])(A_Q24, order))
+#    elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#     define OVERRIDE_silk_LPC_inverse_pred_gain_Q24      (1)
+#     define silk_LPC_inverse_pred_gain_Q24(A_Q24, order)
((void)(arch),silk_LPC_inverse_pred_gain_Q24_neon(A_Q24, order))
+#    endif
+#  endif
+
+# endif /* end FIXED_POINT */
+
+#endif /* end LPC_INV_PRED_GAIN_ARM_H */
diff --git a/silk/arm/LPC_inv_pred_gain_neon_intr.c
b/silk/arm/LPC_inv_pred_gain_neon_intr.c
new file mode 100644
index 0000000..29f0e57
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file warped_autocorrelation_FIX_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for silk
silk_warped_autocorrelation_FIX functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#define QA                          24
+#define A_LIMIT                     SILK_FIX_CONST( 0.99975, QA )
+
+/* Compute inverse of LPC prediction gain, and                                 
*/
+/* test if LPC coefficients are stable (all poles within unit circle)          
*/
+/* Note that this NEON optimization doesn't guarantee identical A_QA
buffer.                    */
+/* Since the intermediate A_QA buffer is never used again in the caller
functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_neon(            /* O   Returns
inverse prediction gain in energy domain, Q30    */
+    opus_int32           A_QA[ 2 ][ SILK_MAX_ORDER_LPC ],   /* I   Prediction
coefficients                                  */
+    const opus_int       order                              /* I   Prediction
order                                         */
+)
+{
+    opus_int   k, n, mult2Q;
+    opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2;
+    opus_int32 *Aold_QA, *Anew_QA;
+
+    Anew_QA = A_QA[ order & 1 ];
+
+    invGain_Q30 = (opus_int32)1 << 30;
+    for( k = order - 1; k > 0; k-- ) {
+        int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
+        int64x2_t mult2Q_s64x2;
+
+        /* Check for stability */
+        if( ( Anew_QA[ k ] > A_LIMIT ) || ( Anew_QA[ k ] < -A_LIMIT ) ) {
+            return 0;
+        }
+
+        /* Set RC equal to negated AR coef */
+        rc_Q31 = -silk_LSHIFT( Anew_QA[ k ], 31 - QA );
+
+        /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
+        rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31,
rc_Q31 );
+        silk_assert( rc_mult1_Q30 > ( 1 << 15 ) );                  
/* reduce A_LIMIT if fails */
+        silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );
+
+        /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
+        mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
+        rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );
+
+        /* Update inverse gain */
+        /* invGain_Q30 range: [ 0 : 2^30 ] */
+        invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2
);
+        silk_assert( invGain_Q30 >= 0           );
+        silk_assert( invGain_Q30 <= ( 1 << 30 ) );
+
+        /* Swap pointers */
+        Aold_QA = Anew_QA;
+        Anew_QA = A_QA[ k & 1 ];
+
+        /* Update AR coefficient */
+        rc_Q31_s32x2   = vdup_n_s32(rc_Q31);
+        mult2Q_s64x2   = vdupq_n_s64(-mult2Q);
+        rc_mult2_s32x2 = vdup_n_s32(rc_mult2);
+
+        for( n = 0; n < k; n += 4 ) {
+            /* We always calculate extra elements of A_QA buffer when (k % 4)
!= 0, to take the advantage of SIMD parallelization. */
+            int32x4_t Aold_QA_s32x4, Aold_QAr_s32x4, t_s32x4, tmp_QA_s32x4;
+            int64x2_t tmp0_s64x2, tmp1_s64x2;
+            Aold_QA_s32x4  = vld1q_s32(Aold_QA + n);
+            Aold_QAr_s32x4 = vld1q_s32(Aold_QA + k - n - 4);
+            Aold_QAr_s32x4 = vrev64q_s32(Aold_QAr_s32x4);
+            Aold_QAr_s32x4 = vcombine_s32(vget_high_s32(Aold_QAr_s32x4),
vget_low_s32(Aold_QAr_s32x4)); // Compiler should generate VSWP.
+            t_s32x4        = vqrdmulhq_lane_s32(Aold_QAr_s32x4, rc_Q31_s32x2,
0);
+            tmp_QA_s32x4   = vsubq_s32(Aold_QA_s32x4, t_s32x4);
+            tmp0_s64x2     = vmull_s32(vget_low_s32 (tmp_QA_s32x4),
rc_mult2_s32x2);
+            tmp1_s64x2     = vmull_s32(vget_high_s32(tmp_QA_s32x4),
rc_mult2_s32x2);
+            tmp0_s64x2     = vrshlq_s64(tmp0_s64x2, mult2Q_s64x2);
+            tmp1_s64x2     = vrshlq_s64(tmp1_s64x2, mult2Q_s64x2);
+            t_s32x4        = vcombine_s32(vmovn_s64(tmp0_s64x2),
vmovn_s64(tmp1_s64x2));
+            vst1q_s32(Anew_QA + n, t_s32x4);
+        }
+    }
+
+    /* Check for stability */
+    if( ( Anew_QA[ 0 ] > A_LIMIT ) || ( Anew_QA[ 0 ] < -A_LIMIT ) ) {
+        return 0;
+    }
+
+    /* Set RC equal to negated AR coef */
+    rc_Q31 = -silk_LSHIFT( Anew_QA[ 0 ], 31 - QA );
+
+    /* Range: [ 1 : 2^30 ] */
+    rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31
);
+
+    /* Update inverse gain */
+    /* Range: [ 0 : 2^30 ] */
+    invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+    silk_assert( invGain_Q30 >= 0     );
+    silk_assert( invGain_Q30 <= 1<<30 );
+
+    return invGain_Q30;
+}
+
+/* For input in Q12 domain */
+opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse
prediction gain in energy domain, Q30        */
+    const opus_int16            *A_Q12,             /* I   Prediction
coefficients, Q12 [order]                         */
+    const opus_int              order               /* I   Prediction order    
*/
+)
+{
+    opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+    opus_int32 DC_resp = 0;
+    int16x8_t  t0_s16x8, t1_s16x8;
+    int32x4_t  t0_s32x4;
+
+    /* Increase Q domain of the AR coefficients */
+    silk_assert(!(order & 1)); // order is even
+    silk_assert(SILK_MAX_ORDER_LPC <= 16);
+    t0_s16x8 = vld1q_s16(A_Q12);
+    t1_s16x8 = vld1q_s16(A_Q12 + 8);
+    t0_s32x4 = vpaddlq_s16(t0_s16x8);
+    switch( order )
+    {
+    case 16:
+    {
+        int32x2_t t_s32x2;
+        int64x1_t t_s64x1;
+        t0_s32x4 = vpadalq_s16(t0_s32x4, t1_s16x8);
+        t_s32x2  = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+        t_s64x1  = vpaddl_s32(t_s32x2);
+        DC_resp  = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+    }
+    break;
+
+    case 14:
+    {
+        int64x1_t t_s64x1;
+        int32x4_t t1_s32x4 = vpaddlq_s16(t1_s16x8);
+        int32x2_t t_s32x2  = vpadd_s32(vget_low_s32(t0_s32x4),
vget_high_s32(t0_s32x4));
+        t_s32x2            = vadd_s32(t_s32x2, vget_low_s32(t1_s32x4));
+        t_s64x1            = vpaddl_s32(t_s32x2);
+        t_s64x1            =
vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1),
vget_high_s32(t1_s32x4)));
+        DC_resp            = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+    }
+    break;
+
+    case 12:
+    {
+        int64x1_t t_s64x1;
+        int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4),
vget_high_s32(t0_s32x4));
+        int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+        t0_s32x2           = vadd_s32(t0_s32x2, t1_s32x2);
+        t_s64x1            = vpaddl_s32(t0_s32x2);
+        DC_resp            = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+    }
+    break;
+
+    case 10:
+    {
+        int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4),
vget_high_s32(t0_s32x4));
+        int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+        int64x1_t t_s64x1  = vpaddl_s32(t0_s32x2);
+        t_s64x1            =
vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), t1_s32x2));
+        DC_resp            = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+    }
+    break;
+
+    case 8:
+        DC_resp += (opus_int32)A_Q12[ 7 ];
+        DC_resp += (opus_int32)A_Q12[ 6 ];
+
+    case 6:
+        DC_resp += (opus_int32)A_Q12[ 5 ];
+        DC_resp += (opus_int32)A_Q12[ 4 ];
+
+    case 4:
+        DC_resp += (opus_int32)A_Q12[ 3 ];
+        DC_resp += (opus_int32)A_Q12[ 2 ];
+
+    case 2:
+        DC_resp += (opus_int32)A_Q12[ 1 ];
+        DC_resp += (opus_int32)A_Q12[ 0 ];
+
+    default:
+        break;
+    }
+
+    /* If the DC is unstable, we don't even need to do the full
calculations */
+    if( DC_resp >= 4096 ) {
+        return 0;
+    }
+    vst1q_s32(Atmp_QA[ 0 ],      vshll_n_s16(vget_low_s16 (t0_s16x8), QA -
12));
+    vst1q_s32(Atmp_QA[ 0 ] +  4, vshll_n_s16(vget_high_s16(t0_s16x8), QA -
12));
+    vst1q_s32(Atmp_QA[ 0 ] +  8, vshll_n_s16(vget_low_s16 (t1_s16x8), QA -
12));
+    vst1q_s32(Atmp_QA[ 0 ] + 12, vshll_n_s16(vget_high_s16(t1_s16x8), QA -
12));
+
+    return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#ifdef FIXED_POINT
+
+/* For input in Q24 domain */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon(     /* O    Returns inverse
prediction gain in energy domain, Q30       */
+    const opus_int32            *A_Q24,             /* I    Prediction
coefficients [order]                             */
+    const opus_int              order               /* I    Prediction order   
*/
+)
+{
+    opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+
+    /* Increase Q domain of the AR coefficients */
+    silk_assert(!(order & 1)); // order is even
+    silk_assert(SILK_MAX_ORDER_LPC == 16);
+    silk_assert(QA == 24); // No shift.
+    vst1q_s32(Atmp_QA[ 0 ],      vld1q_s32(A_Q24));
+    vst1q_s32(Atmp_QA[ 0 ] +  4, vld1q_s32(A_Q24 +  4));
+    vst1q_s32(Atmp_QA[ 0 ] +  8, vld1q_s32(A_Q24 +  8));
+    vst1q_s32(Atmp_QA[ 0 ] + 12, vld1q_s32(A_Q24 + 12));
+
+    return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 2e330c4..59ceb6e 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -30,11 +30,21 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 #include "NSQ.h"
+#include "SigProc_FIX.h"
 
 #if defined(OPUS_HAVE_RTCD)
 
-# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
- !defined(OPUS_ARM_PRESUME_NEON_INTR))
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR))
+
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O  
Returns inverse prediction gain in energy domain, Q30        */
+        const opus_int16            *A_Q12,                             /* I  
Prediction coefficients, Q12 [order]                         */
+        const opus_int              order                               /* I  
Prediction order                                             */
+) = {
+      silk_LPC_inverse_pred_gain_c,              /* ARMv4 */
+      silk_LPC_inverse_pred_gain_c,              /* EDSP */
+      silk_LPC_inverse_pred_gain_c,              /* Media */
+      MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */
+};
 
 /*There is no table for silk_noise_shape_quantizer_short_prediction because the
    NEON version takes different parameters than the C version.
@@ -56,6 +66,16 @@ opus_int32
 #if defined(FIXED_POINT) && \
  defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
 
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK + 1])( /*
O    Returns inverse prediction gain in energy domain, Q30       */
+        const opus_int32            *A_Q24,                                 /*
I    Prediction coefficients [order]                             */
+        const opus_int              order                                   /*
I    Prediction order                                            */
+) = {
+      silk_LPC_inverse_pred_gain_Q24_c,              /* ARMv4 */
+      silk_LPC_inverse_pred_gain_Q24_c,              /* EDSP */
+      silk_LPC_inverse_pred_gain_Q24_c,              /* Media */
+      MAY_HAVE_NEON(silk_LPC_inverse_pred_gain_Q24), /* Neon */
+};
+
 void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
           opus_int32                *corr,                                  /*
O    Result [order + 1]                                                         
*/
           opus_int                  *scale,                                 /*
O    Scaling of the correlation vector                                          
*/
diff --git a/silk/decode_parameters.c b/silk/decode_parameters.c
index e345b1d..a56a409 100644
--- a/silk/decode_parameters.c
+++ b/silk/decode_parameters.c
@@ -52,7 +52,7 @@ void silk_decode_parameters(
     silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices,
psDec->psNLSF_CB );
 
     /* Convert NLSF parameters to AR prediction filter coefficients */
-    silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15,
psDec->LPC_order );
+    silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15,
psDec->LPC_order, psDec->arch );
 
     /* If just reset, e.g., because internal Fs changed, do not allow
interpolation */
     /* improves the case of packet loss in the first frame after a switch      
*/
@@ -69,7 +69,7 @@ void silk_decode_parameters(
         }
 
         /* Convert NLSF parameters to AR prediction filter coefficients */
-        silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15,
psDec->LPC_order );
+        silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15,
psDec->LPC_order, psDec->arch );
     } else {
         /* Copy LPC coefficients for first half from second half */
         silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ],
psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) );
diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c
index e11cdc8..e55b63a 100644
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -92,7 +92,7 @@ void silk_find_LPC_FIX(
             silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15,
k, psEncC->predictLPCOrder );
 
             /* Convert to LPC for residual energy evaluation */
-            silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
+            silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder,
psEncC->arch );
 
             /* Calculate residual energy with NLSF interpolation */
             silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length,
psEncC->predictLPCOrder, psEncC->arch );
diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c
index fcfe1c3..4d63964 100644
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -73,7 +73,7 @@ void silk_find_LPC_FLP(
             silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15,
k, psEncC->predictLPCOrder );
 
             /* Convert to LPC for residual energy evaluation */
-            silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder );
+            silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder,
psEncC->arch );
 
             /* Calculate residual energy with LSF interpolation */
             silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length,
psEncC->predictLPCOrder );
diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h
index e5a7597..c2105a5 100644
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -285,7 +285,8 @@ void silk_A2NLSF_FLP(
 void silk_NLSF2A_FLP(
     silk_float                      *pAR,                               /* O   
LPC coefficients [ LPC_order ]              */
     const opus_int16                *NLSF_Q15,                          /* I   
NLSF vector      [ LPC_order ]              */
-    const opus_int                  LPC_order                           /* I   
LPC order                                   */
+    const opus_int                  LPC_order,                          /* I   
LPC order                                   */
+    int                             arch                                /* I   
Run-time architecture                       */
 );
 
 /* Limit, stabilize, and quantize NLSFs */
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index 6666b8e..53a556e 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -54,13 +54,14 @@ void silk_A2NLSF_FLP(
 void silk_NLSF2A_FLP(
     silk_float                      *pAR,                               /* O   
LPC coefficients [ LPC_order ]              */
     const opus_int16                *NLSF_Q15,                          /* I   
NLSF vector      [ LPC_order ]              */
-    const opus_int                  LPC_order                           /* I   
LPC order                                   */
+    const opus_int                  LPC_order,                          /* I   
LPC order                                   */
+    int                             arch                                /* I   
Run-time architecture                       */
 )
 {
     opus_int   i;
     opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];
 
-    silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );
+    silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch );
 
     for( i = 0; i < LPC_order; i++ ) {
         pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );
diff --git a/silk/init_decoder.c b/silk/init_decoder.c
index f887c67..16c03dc 100644
--- a/silk/init_decoder.c
+++ b/silk/init_decoder.c
@@ -44,6 +44,7 @@ opus_int silk_init_decoder(
     /* Used to deactivate LSF interpolation */
     psDec->first_frame_after_reset = 1;
     psDec->prev_gain_Q16 = 65536;
+    psDec->arch = opus_select_arch();
 
     /* Reset CNG state */
     silk_CNG_Reset( psDec );
diff --git a/silk/process_NLSFs.c b/silk/process_NLSFs.c
index 0ab71f0..2f10f8d 100644
--- a/silk/process_NLSFs.c
+++ b/silk/process_NLSFs.c
@@ -89,7 +89,7 @@ void silk_process_NLSFs(
         NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors,
psEncC->indices.signalType );
 
     /* Convert quantized NLSFs back to LPC coefficients */
-    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );
+    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder,
psEncC->arch );
 
     if( doInterpolate ) {
         /* Calculate the interpolated, quantized LSF vector for the first half
*/
@@ -97,7 +97,7 @@ void silk_process_NLSFs(
             psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );
 
         /* Convert back to LPC coefficients */
-        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15,
psEncC->predictLPCOrder );
+        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15,
psEncC->predictLPCOrder, psEncC->arch );
 
     } else {
         /* Copy LPC coefficients for first half from second half */
diff --git a/silk/structs.h b/silk/structs.h
index 827829d..b68e4c9 100644
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -301,6 +301,7 @@ typedef struct {
     /* Stuff used for PLC */
     opus_int                    lossCnt;
     opus_int                    prevSignalType;
+    int                         arch;
 
     silk_PLC_struct sPLC;
 
diff --git a/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
new file mode 100644
index 0000000..e98f3f6
--- /dev/null
+++ b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "SigProc_FIX.h"
+
+static int test_silk_LPC_inverse_pred_gain(int arch)
+{
+    unsigned int i;
+    opus_int     order;
+    opus_int16   A_Q12[ SILK_MAX_ORDER_LPC ];
+    opus_int32   rtn_org, rtn_opt;
+    (void)arch;
+
+    printf("%50s", "LPC_inverse_pred_gain() ...");
+    for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must
be even.
+    {
+        for (unsigned int shift = 0; shift < 16; shift++) // Test dynamic
range.
+        {
+            for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+            {
+                A_Q12[i] = ((opus_int16)rand()) >> shift;
+            }
+
+            rtn_org = silk_LPC_inverse_pred_gain_c(A_Q12, order);
+            rtn_opt = silk_LPC_inverse_pred_gain  (A_Q12, order);
+            if ((rtn_org != rtn_opt))
+            {
+                printf("order=%2d failed!\n", order);
+                printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+                return -1;
+            }
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
+
+#ifdef FIXED_POINT
+
+static int test_silk_LPC_inverse_pred_gain_Q24(int arch)
+{
+    unsigned int i;
+    opus_int     order;
+    opus_int32   A_Q24[ SILK_MAX_ORDER_LPC ];
+    opus_int32   rtn_org, rtn_opt;
+    (void)arch;
+
+    printf("%50s", "LPC_inverse_pred_gain_Q24() ...");
+    for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must
be even.
+    {
+        for (unsigned int shift = 0; shift < 31; shift++) // Test dynamic
range.
+        {
+            for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+            {
+                A_Q24[i] = ((opus_int32)rand()) >> shift;
+            }
+
+            rtn_org = silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order);
+            rtn_opt = silk_LPC_inverse_pred_gain_Q24  (A_Q24, order);
+            if ((rtn_org != rtn_opt))
+            {
+                printf("order=%2d failed!\n", order);
+                printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+                return -1;
+            }
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index 52c42d0..ca9bf27 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@ silk/resampler_rom.h \
 silk/resampler_structs.h \
 silk/SigProc_FIX.h \
 silk/x86/SigProc_FIX_sse.h \
+silk/arm/LPC_inv_pred_gain_arm.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
 silk/arm/macros_arm64.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 5f9551b..d8323df 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -84,6 +84,7 @@ silk/x86/VQ_WMat_EC_sse.c
 
 SILK_SOURCES_ARM_NEON_INTR = \
 silk/arm/arm_silk_map.c \
+silk/arm/LPC_inv_pred_gain_neon_intr.c \
 silk/arm/NSQ_neon.c
 
 SILK_SOURCES_FIXED = \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index b5c25d9..8e90074 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -29,6 +29,7 @@
 #endif
 
 #include <stdio.h>
+#include "cpu_support.h"
 #include "stack_alloc.h"
 
 #define SKIP_CONFIG_H
@@ -44,21 +45,25 @@
 
 #endif
 
+# include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c"
+
 int main(void)
 {
    int result = 0; /* 0: passed; other: failed */
    ALLOC_STACK;
-#ifdef FIXED_POINT
    int arch = opus_select_arch();
-#endif /* FIXED_POINT */
    int count = 10;
 
+   srand(0);
+
    while (!result && count--) {
       printf("\n--------------------------- Testing optimization
---------------------------\n");
 #ifdef FIXED_POINT
       result |= test_fir(arch);
+      result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
       result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
+      result |= test_silk_LPC_inverse_pred_gain(arch);
    }
    return result;
 }
-- 
2.8.0.rc3.226.g39d4020

Timothy B. Terriberry

2016-Sep-28 01:42 UTC

head link

[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

Linfeng Zhang wrote:> +#ifdef SMALL_FOOTPRINT
> +   for (i=0;i<N-7;i+=8)
> +   {
> [snip over 80 lines of complicated NEON intrinsics code]
> +   }
> +#else
So, one of the points of SMALL_FOOTPRINT is to reduce the code size on 
targets where this matters (even if it means running slower), but this 
is an awful lot of code.

I think it makes much more sense to expose the existing xcorr_kernel asm 
and use that. I wrote a simple patch demonstrating this (attached... it 
applies on top of your full series, so it'd be a little work to rebase 
it into place here). It adds one 16-byte table and 16 instructions, and 
even gives speed-ups on non-NEON CPUs by reusing the existing EDSP asm.

Testing on comp48-stereo.sw encoded to 64 kbps and decoded with a 15% 
loss rate on a Novena using opus_demo (by using RTCD and changing the 
function pointers to the version of the code to test), optimizing 
xcorr_kernel gives almost as much speed-up as intrinsics for all of 
celt_fir:

celt_fir_c, xcorr_kernel_c:
1753 ms (stddev 9) [1730 1740 {1740 1740 1740 1750 1750 1750 1750 1750 
1750 1750 1750 1750 1750 1750 1760 1760 1760 1760 1770 1770} 1780 1860]

celt_fir_c, xcorr_kernel_neon:
1710 ms (stddev 12) [1680 1690 {1690 1690 1700 1700 1700 1700 1710 1710 
1710 1710 1710 1710 1710 1710 1710 1720 1720 1730 1730 1730} 1740 1810]

celt_fir_neon:
1695 ms (stddev 9) [1670 1680 {1680 1680 1680 1690 1690 1690 1690 1690 
1690 1690 1700 1700 1700 1700 1700 1700 1700 1700 1710 1710} 1720 1790]

It might even be enough to use this for the non-SMALL_FOOTPRINT case. 
What do you think?

Apparently Analagous Threads

Search for more apparently analagous threads

opus - Jul 2016 - [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

[opus] Several patches of ARM NEON optimization

[opus] [PATCH 1/5] Revise celt_fir_c() to not pass in argument "mem"

[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

[opus] [PATCH 3/5] Optimize silk_warped_autocorrelation_FIX() for ARM NEON

[opus] [PATCH 4/5] Refactor silk_warped_autocorrelation_FIX_neon()

[opus] [PATCH 5/5] Optimize silk/LPC_inv_pred_gain.c for ARM NEON

[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

Apparently Analagous Threads