thr3ads.net - opus - [opus] ARM NEON optimization -- celt

If this information is useful, please help other people find it:
Share via:

Linfeng Zhang

2016-Jun-17 21:09 UTC

[opus] ARM NEON optimization -- celt_fir()

Hi all,

This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in the
next few months.

I'm submitting 2 patches in the following couple of emails, which contain the
newly created celt_fir_neon().

I revised celt_fir_c() in Patch 1 so that it no longer takes the argument
"mem". If there are concerns about this change, please let me know.

Many thanks for your comments.

Linfeng Zhang

Linfeng Zhang

2016-Jun-17 21:09 UTC

head link

[opus] [PATCH 1/2] Revise celt_fir_c() to not pass in argument "mem"

The "mem" in celt_fir_c() is either already contained in the head of input "x"
in reverse order, or can easily be attached to the head of "x" before calling
the function. Removing the argument "mem" eliminates the redundant buffer
copies inside.
Update celt_fir_sse4_1() accordingly.
---
 celt/celt_decoder.c        | 10 ++++-----
 celt/celt_lpc.c            | 33 +++++++++++-------------------
 celt/celt_lpc.h            |  5 ++---
 celt/x86/celt_lpc_sse.c    | 51 +++++++++-------------------------------------
 celt/x86/celt_lpc_sse.h    | 10 ++++-----
 celt/x86/x86_celt_map.c    |  1 -
 silk/LPC_analysis_filter.c |  6 +-----
 7 files changed, 34 insertions(+), 82 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index b978bb3..f8433eb 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -509,7 +509,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st,
int N, int LM)
       opus_val16 fade = Q15ONE;
       int pitch_index;
       VARDECL(opus_val32, etmp);
-      VARDECL(opus_val16, exc);
+      VARDECL(opus_val16, _exc);
 
       if (loss_count == 0)
       {
@@ -520,7 +520,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st,
int N, int LM)
       }
 
       ALLOC(etmp, overlap, opus_val32);
-      ALLOC(exc, MAX_PERIOD, opus_val16);
+      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      opus_val16 *exc = _exc+LPC_ORDER;
       window = mode->window;
       c=0; do {
          opus_val16 decay;
@@ -568,15 +569,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT
st, int N, int LM)
          /* Initialize the LPC history with the samples just before the start
             of the region for which we're computing the excitation. */
          {
-            opus_val16 lpc_mem[LPC_ORDER];
             for (i=0;i<LPC_ORDER;i++)
             {
-               lpc_mem[i] =
+               exc[MAX_PERIOD-exc_length-1-i] =
                      ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
             }
             /* Compute the excitation for exc_length samples before the loss.
*/
             celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem,
st->arch);
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER,
st->arch);
          }
 
          /* Check if the waveform is decaying, and if so how fast.
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index f02145a..80124df 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -90,56 +90,47 @@ int          p
 
 
 void celt_fir_c(
-         const opus_val16 *_x,
+         const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
    int i,j;
    VARDECL(opus_val16, rnum);
-   VARDECL(opus_val16, x);
    SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-   for (i=0;i<N;i++)
-      x[i+ord]=_x[i];
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    (void)arch;
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(rnum, x+i, sum, ord, arch);
-      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0],
SIG_SHIFT)));
-      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1],
SIG_SHIFT)));
-      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2],
SIG_SHIFT)));
-      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3],
SIG_SHIFT)));
+      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+      y[i  ] = SATURATE16(ADD32(EXTEND32(x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
+      y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+      y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+      y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 #endif
    RESTORE_STACK;
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index 323459e..a4c5fd6 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -45,12 +45,11 @@ void celt_fir_c(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if !defined(OVERRIDE_CELT_FIR)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    (celt_fir_c(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    (celt_fir_c(x, num, y, N, ord, arch))
 #endif
 
 void celt_iir(const opus_val32 *x,
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
index 67e5592..12a9b0e 100644
--- a/celt/x86/celt_lpc_sse.c
+++ b/celt/x86/celt_lpc_sse.c
@@ -40,63 +40,32 @@
 
 #if defined(FIXED_POINT)
 
-void celt_fir_sse4_1(const opus_val16 *_x,
+void celt_fir_sse4_1(const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
     int i,j;
     VARDECL(opus_val16, rnum);
-    VARDECL(opus_val16, x);
 
     __m128i vecNoA;
     opus_int32 noA ;
     SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-
-   for (i=0;i<N-7;i+=8)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-       x[i+ord+4]=_x[i+4];
-       x[i+ord+5]=_x[i+5];
-       x[i+ord+6]=_x[i+6];
-       x[i+ord+7]=_x[i+7];
-   }
-
-   for (;i<N-3;i+=4)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-   }
-
-   for (;i<N;i++)
-         x[i+ord]=_x[i];
-
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    noA = EXTEND32(1) << SIG_SHIFT >> 1;
@@ -107,22 +76,22 @@ void celt_fir_sse4_1(const opus_val16 *_x,
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
 
-      xcorr_kernel(rnum, x+i, sums, ord, arch);
+      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
 
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
-      vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+      vecX = OP_CVTEPI16_EPI32_M64(x + i);
       vecSum = _mm_add_epi32(vecSum, vecX);
       vecSum = _mm_packs_epi32(vecSum, vecSum);
-      _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+      _mm_storel_epi64((__m128i *)(y + i), vecSum);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum, rnum[j], x[i + j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 
 #endif
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index c5ec796..7d1ecf7 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -41,12 +41,11 @@ void celt_fir_sse4_1(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
 #else
 
@@ -56,11 +55,10 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
-#  define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem,
arch))
+#  define celt_fir(x, num, y, N, ord, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
 #endif
 #endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 8e5e449..1331c10 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -46,7 +46,6 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16       *y,
          int              N,
          int              ord,
-         opus_val16       *mem,
          int              arch
 ) = {
   celt_fir_c,                /* non-sse */
diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index 2090667..5aeee4c 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -50,7 +50,6 @@ void silk_LPC_analysis_filter(
 {
     opus_int   j;
 #ifdef FIXED_POINT
-    opus_int16 mem[SILK_MAX_ORDER_LPC];
     opus_int16 num[SILK_MAX_ORDER_LPC];
 #else
     int ix;
@@ -67,10 +66,7 @@ void silk_LPC_analysis_filter(
     for ( j = 0; j < d; j++ ) {
         num[ j ] = -B[ j ];
     }
-    for (j=0;j<d;j++) {
-        mem[ j ] = in[ d - j - 1 ];
-    }
-    celt_fir( in + d, num, out + d, len - d, d, mem, arch );
+    celt_fir( in + d, num, out + d, len - d, d, arch );
     for ( j = 0; j < d; j++ ) {
         out[ j ] = 0;
     }
-- 
2.8.0.rc3.226.g39d4020

Linfeng Zhang

2016-Jun-17 21:09 UTC

head link

[opus] [PATCH 2/2] Optimize fixed-point celt_fir_c() for ARM NEON

Create the fixed-point intrinsics optimization celt_fir_neon() for ARM NEON.
Create test celt/tests/test_unit_optimization_lpc to unit test the optimization.
---
 .gitignore                              |   2 +
 Makefile.am                             |  17 ++-
 celt/arm/arm_celt_map.c                 |  17 +++
 celt/arm/celt_lpc_arm.h                 |  65 ++++++++
 celt/arm/celt_lpc_neon_intr.c           | 254 ++++++++++++++++++++++++++++++++
 celt/celt_lpc.h                         |   5 +
 celt/tests/test_unit_dft.c              |   1 +
 celt/tests/test_unit_mathops.c          |   1 +
 celt/tests/test_unit_mdct.c             |   1 +
 celt/tests/test_unit_optimization_lpc.c | 121 +++++++++++++++
 celt/tests/test_unit_rotation.c         |   1 +
 celt_headers.mk                         |   1 +
 celt_sources.mk                         |   1 +
 13 files changed, 482 insertions(+), 5 deletions(-)
 create mode 100644 celt/arm/celt_lpc_arm.h
 create mode 100644 celt/arm/celt_lpc_neon_intr.c
 create mode 100644 celt/tests/test_unit_optimization_lpc.c

diff --git a/.gitignore b/.gitignore
index 33127c9..6d3b48b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ install-sh
 .deps
 .libs
 .dirstamp
+.project
 *.a
 *.exe
 *.la
@@ -57,6 +58,7 @@ celt/tests/test_unit_entropy
 celt/tests/test_unit_laplace
 celt/tests/test_unit_mathops
 celt/tests/test_unit_mdct
+celt/tests/test_unit_optimization_lpc
 celt/tests/test_unit_rotation
 celt/tests/test_unit_types
 doc/doxygen_sqlite3.db
diff --git a/Makefile.am b/Makefile.am
index cfdaced..e06d687 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -83,9 +83,9 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h
include/opus_type
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
 
 if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api
tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding
celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_mathops celt/tests/test_unit_mdct
celt/tests/test_unit_rotation celt/tests/test_unit_types
+noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api
tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding
celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_mathops celt/tests/test_unit_mdct
celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation
celt/tests/test_unit_types
 
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation
celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode
tests/test_opus_encode tests/test_opus_padding
+TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops
celt/tests/test_unit_entropy celt/tests/test_unit_laplace
celt/tests/test_unit_dft celt/tests/test_unit_mdct
celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation
celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode
tests/test_opus_encode tests/test_opus_padding
 
 opus_demo_SOURCES = src/opus_demo.c
 
@@ -137,6 +137,12 @@ if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_mdct_LDADD += libarmasm.la
 endif
 
+celt_tests_test_unit_optimization_lpc_SOURCES =
celt/tests/test_unit_optimization_lpc.c
+celt_tests_test_unit_optimization_lpc_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_optimization_lpc_LDADD += libarmasm.la
+endif
+
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
 celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM)
 if OPUS_ARM_EXTERNAL_ASM
@@ -272,10 +278,11 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S):
$(top_srcdir)/celt/arm/arm2gnu.pl
 %-gnu.S: %.s
 	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
 
-OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
-                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
+OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_dft_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
-                    $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+                    $(celt_tests_test_unit_optimization_lpc_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o)
 
 if HAVE_SSE
 SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index ee6c244..4e90fde 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -29,6 +29,7 @@
 #include "config.h"
 #endif
 
+#include "celt_lpc.h"
 #include "pitch.h"
 #include "kiss_fft.h"
 #include "mdct.h"
@@ -36,6 +37,22 @@
 #if defined(OPUS_HAVE_RTCD)
 
 # if defined(FIXED_POINT)
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch);
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int) = {
+  celt_fir_c,             /* ARMv4 */
+  celt_fir_c,             /* EDSP */
+  celt_fir_c,             /* Media */
+  MAY_HAVE_NEON(celt_fir) /* NEON */
+};
+
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
     const opus_val16 *, opus_val32 *, int , int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h
new file mode 100644
index 0000000..101df3d
--- /dev/null
+++ b/celt/arm/celt_lpc_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(CELT_LPC_ARM_H)
+# define CELT_LPC_ARM_H
+
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_CELT_FIR (1)
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((void)(arch),PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch))
+#  endif
+
+#if !defined(OVERRIDE_CELT_FIR)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int);
+
+#  define OVERRIDE_CELT_FIR
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+# endif
+#endif
+#endif /* end FIXED_POINT */
+
+#endif /* end CELT_LPC_ARM_H */
diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c
new file mode 100644
index 0000000..4715d0b
--- /dev/null
+++ b/celt/arm/celt_lpc_neon_intr.c
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file celt_lpc_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt lpc functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch)
+{
+   int i,j;
+   const int leftover = N & 7;
+   const opus_val16 *x = _x-ord;
+   VARDECL(opus_val16, rnum);
+   SAVE_STACK;
+   /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero. */
+   ALLOC(rnum, ord+3, opus_val16);
+   for (i=0;i<ord-3;i+=4)
+      vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4)));
+   for (;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0;
+   (void)arch;
+
+#ifdef SMALL_FOOTPRINT
+   for (i=0;i<N-7;i+=8)
+   {
+      int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+      int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+      for (j=0;j<ord;j+=4)
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+      }
+      vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT),
vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)));
+   }
+   if (leftover)
+   {
+      if (leftover > 4)
+      {
+         int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+         int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+         }
+         const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4,
SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT));
+         vst1_s16(y+i, vget_low_s16(y_s16x8));
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else {
+         int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0),
rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1),
rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2),
rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3),
rnum_s16x4, 3);
+         }
+         const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#else
+   for (i=0;i<N-7;i+=8)
+   {
+      int32x4_t sum0_s32x4, sum1_s32x4;
+      sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+      for (j=0;j<ord;j+=4)
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+      }
+      sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+      sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+      const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+      sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+      vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4),
vqmovn_s32(sum1_s32x4)));
+   }
+   if (leftover)
+   {
+      if (leftover > 4)
+      {
+         int32x4_t sum0_s32x4, sum1_s32x4;
+         sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8),
rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8),
rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+         const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+         sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+         const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4),
vqmovn_s32(sum1_s32x4));
+         vst1_s16(y+i, vget_low_s16(y_s16x8));
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else {
+         int32x4_t sum0_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0),
rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1),
rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2),
rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3),
rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i));
+         const int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#endif
+   RESTORE_STACK;
+}
+
+#endif
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index a4c5fd6..76a73c0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -35,6 +35,11 @@
 #include "x86/celt_lpc_sse.h"
 #endif
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/celt_lpc_arm.h"
+#endif
+
 #define LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 6166eb0..582618e 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -52,6 +52,7 @@
 # include "celt_lpc.c"
 # include "pitch.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "mdct.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index fd3319d..da92f16 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -66,6 +66,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 8dbb9ca..0658c7a 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -53,6 +53,7 @@
 # include "pitch.c"
 # include "celt_lpc.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c
new file mode 100644
index 0000000..146526e
--- /dev/null
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#define CELT_C
+#include "stack_alloc.h"
+#include "mathops.c"
+#include "entcode.c"
+
+#ifdef FIXED_POINT
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# include "celt_lpc.c"
+# include "pitch.c"
+# include "x86/x86cpu.c"
+# include "x86/celt_lpc_sse.c"
+# include "x86/pitch_sse2.c"
+# include "x86/pitch_sse4_1.c"
+# include "x86/x86_celt_map.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "celt_lpc.c"
+# include "pitch.c"
+# include "arm/armcpu.c"
+# include "arm/arm_celt_map.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
+# endif
+#endif
+
+#define MAX_ORDER 32
+
+/* Cross-check celt_fir() against the reference celt_fir_c().
+
+   For every filter order ord in 0..MAX_ORDER and every length N in
+   ord..MAX_PERIOD, the input, the coefficients and the output pre-fill are
+   regenerated with rand(); both output buffers start out identical, so any
+   difference after the two calls comes from the filters themselves.
+
+   arch is forwarded unchanged to both implementations.
+
+   On the first mismatch the differing samples are printed and the process
+   exits with a non-zero status so the failure is visible to whoever runs
+   the test program. */
+void test_fir(int arch)
+{
+   /* MAX_ORDER extra samples sit in front of the filtered region; the patch
+      description says the filter history ("mem") lives at the head of x,
+      which is why the filters are handed x+MAX_ORDER below. */
+   opus_val16 x[MAX_PERIOD+MAX_ORDER];
+   opus_val16 num[MAX_ORDER];
+   opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD];
+   int N, ord;
+
+   unsigned int i;
+   for(ord=0;ord<=MAX_ORDER;ord++)
+   {
+      printf("ord=%2d", ord);
+      for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
+      {
+         for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
+         {
+            x[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_PERIOD;++i)
+         {
+            yorg[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_ORDER;++i)
+         {
+            num[i] = (rand() % 32767) - 16384;
+         }
+         /* Same pre-fill in both outputs: samples neither filter writes
+            stay equal, so the whole-buffer memcmp() below is meaningful. */
+         memcpy(yopt, yorg, sizeof(yorg));
+
+         celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch);
+         celt_fir  (x+MAX_ORDER, num, yopt, N, ord, arch);
+         if (memcmp(yorg, yopt, sizeof(yorg)))
+         {
+            printf(" N=%3d failed!\nError in lpc unit test!!!\n", N);
+            for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
+            {
+               if (yorg[i] != yopt[i])
+               {
+                  /* i is unsigned, hence %u rather than %d. */
+                  printf("yorg[%3u]=%d, yopt[%3u]=%d\n", i, yorg[i], i, yopt[i]);
+               }
+            }
+            /* A mismatch is a test failure: do not report success. */
+            exit(1);
+         }
+      }
+      printf(" passed!\n");
+   }
+}
+#endif /* FIXED_POINT */
+
+/* Entry point of the LPC optimization unit test.
+   In fixed-point builds it selects the run-time architecture and runs
+   test_fir(); in other builds it does nothing. Returns 0 when it reaches
+   the end. */
+int main(void)
+{
+#ifdef FIXED_POINT
+   /* Declared before ALLOC_STACK so that no declaration follows a
+      statement (C89); the call itself still happens after ALLOC_STACK. */
+   int arch;
+   ALLOC_STACK;
+   arch = opus_select_arch();
+   test_fir(arch);
+#endif /* FIXED_POINT */
+   return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 1080c20..3a85a29 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -64,6 +64,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt_headers.mk b/celt_headers.mk
index 0eca6e6..4067e5a 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -34,6 +34,7 @@ celt/static_modes_fixed.h \
 celt/static_modes_float_arm_ne10.h \
 celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
+celt/arm/celt_lpc_arm.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
 celt/arm/kiss_fft_armv4.h \
diff --git a/celt_sources.mk b/celt_sources.mk
index 2ffe99a..37c0129 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
 
 CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_lpc_neon_intr.c \
 celt/arm/celt_neon_intr.c
 
 CELT_SOURCES_ARM_NE10= \
-- 
2.8.0.rc3.226.g39d4020

Jonathan Lennox

2016-Jun-17 21:37 UTC

head link

[opus] ARM NEON optimization -- celt_fir()

Hi, Linfeng —

Please note the aarch64 optimization patches I submitted in November and
December (which Tim still hasn’t gotten around to reviewing).  As they used Neon
intrinsics, several of these actually applied to both armv7 and aarch64 Neon.

In particular, note
http://lists.xiph.org/pipermail/opus/2015-December/003339.html , which added a
Neon-optimized version of xcorr_kernel.  xcorr_kernel is used in celt_fir,
celt_iir, and celt_pitch_xcorr.
> On Jun 17, 2016, at 5:09 PM, Linfeng Zhang <linfengz at google.com> wrote:
> 
> Hi all,
> 
> This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in the
> next few months.
> 
> I'm submitting 2 patches in the following couple of emails, which have the new
> created celt_fir_neon().
> 
> I revised celt_fir_c() to not pass in argument "mem" in Patch 1. If there are
> concerns to this change, please let me know.
> 
> Many thanks to your comments.
> 
> Linfeng Zhang
> 
> _______________________________________________
> opus mailing list
> opus at xiph.org
> http://lists.xiph.org/mailman/listinfo/opus

Linfeng Zhang

2016-Jun-17 22:26 UTC

head link

[opus] ARM NEON optimization -- celt_fir()

Hi Jonathan,

Thanks for pointing me to the related submissions!

I noticed the xcorr_kernel_neon() assembly in the current code base, but I don't
know why it's not activated for celt_fir() etc. So I decided to inline it
inside celt_fir() to avoid initializing/saving/loading the sum[] buffer, and
to handle the variable filter order "ord" differently.

Thanks,
Linfeng

On Fri, Jun 17, 2016 at 2:37 PM, Jonathan Lennox <jonathan at vidyo.com> wrote:
> Hi, Linfeng —
>
> Please note the aarch64 optimization patches I submitted in November and
> December (which Tim still hasn’t gotten around to reviewing).  As they used
> Neon intrinsics, several of these actually applied to both armv7 and
> aarch64 Neon.
>
> In particular, note
> http://lists.xiph.org/pipermail/opus/2015-December/003339.html , which
> added a Neon-optimized version of xcorr_kernel.  xcorr_kernel is used in
> celt_fir, celt_iir, and celt_pitch_xcorr.
>
> > On Jun 17, 2016, at 5:09 PM, Linfeng Zhang <linfengz at google.com> wrote:
> >
> > Hi all,
> >
> > This is Linfeng Zhang from Google. I'll work on ARM NEON optimization in the
> > next few months.
> >
> > I'm submitting 2 patches in the following couple of emails, which have the new
> > created celt_fir_neon().
> >
> > I revised celt_fir_c() to not pass in argument "mem" in Patch 1. If there are
> > concerns to this change, please let me know.
> >
> > Many thanks to your comments.
> >
> > Linfeng Zhang
> >
> > _______________________________________________
> > opus mailing list
> > opus at xiph.org
> > http://lists.xiph.org/mailman/listinfo/opus
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.xiph.org/pipermail/opus/attachments/20160617/2dc691ef/attachment.html>

Timothy B. Terriberry

2016-Jun-17 22:31 UTC

head link

[opus] ARM NEON optimization -- celt_fir()

Jonathan Lennox wrote:
> Hi, Linfeng —
>
> Please note the aarch64 optimization patches I submitted in November and
> December (which Tim still hasn’t gotten around to reviewing).  As they used Neon
> intrinsics, several of these actually applied to both armv7 and aarch64 Neon.

Yes, it seems I need to move this up my priority list to avoid everyone
wasting time on duplicate work. Fortunately, I have a long plane flight
tomorrow...

If you have any comments on Jonathan's patches (any of them), Linfeng, 
that would also be good to know.

Seemingly Similar Threads

Search for more possibly parallel threads

opus - Jun 2016 - ARM NEON optimization -- celt_fir()

[opus] ARM NEON optimization -- celt_fir()

[opus] [PATCH 1/2] Revise celt_fir_c() to not pass in argument "mem"

[opus] [PATCH 2/2] Optimize fixed-point celt_fir_c() for ARM NEON

[opus] ARM NEON optimization -- celt_fir()

[opus] ARM NEON optimization -- celt_fir()

[opus] ARM NEON optimization -- celt_fir()

Seemingly Similar Threads