Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 0/8] Patches for arm64 (aarch64) support
This sequence of patches provides arm64 support for Opus.  Tested on
iOS, Android, and Ubuntu 14.04. 
The patch sequence was written on top of Viswanath Puttagunta's Ne10
patches, but all but the second ("Reorganize pitch_arm.h") should, I
think, apply independently of it.  It does depend on my previous
intrinsics configury reorganization, however.
Comments welcome.
With this and my previous patch set, I believe it should be safe to
turn --enable-intrinsics on by default in Opus's configure script.
Jonathan Lennox (8):
  Move ARM-specific macro overrides to arm-specific file.
  Reorganize pitch_arm.h, so RTCD works for intrinsics functions as
    well.
  Autoconf changes for arm64 inline assembly support.
  Arm64 assembly for Celt fixed-point math.
  Arm64 assembly for Silk math.
  Add Neon intrinsics for Silk noise shape quantization.
  Add Neon intrinsics for Silk noise shape feedback loop.
  Apply Neon short prediction optimization to
    silk_noise_shape_quantizer_del_dec.
 Makefile.am                    |   8 ++-
 celt/arch.h                    |   2 +
 celt/arm/arm_celt_map.c        |  24 +++++++-
 celt/arm/fixed_arm64.h         |  75 ++++++++++++++++++++++++
 celt/arm/pitch_arm.h           |  94 ++++++++++++++++++++++-------
 celt/pitch.h                   |  19 ------
 celt_headers.mk                |   1 +
 configure.ac                   |  19 ++++++
 silk/NSQ.c                     |  55 +++++------------
 silk/NSQ.h                     |  97 ++++++++++++++++++++++++++++++
 silk/NSQ_del_dec.c             |  37 +++++-------
 silk/SigProc_FIX.h             |   4 ++
 silk/arm/NSQ_neon.c            | 130 +++++++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h            | 101 ++++++++++++++++++++++++++++++++
 silk/arm/SigProc_FIX_arm64.h   |  46 +++++++++++++++
 silk/arm/macros_arm64.h        |  66 +++++++++++++++++++++
 silk/macros.h                  |   4 ++
 silk/mips/NSQ_del_dec_mipsr1.h |   3 +-
 silk/x86/NSQ_sse.c             |   2 +-
 silk/x86/main_sse.h            |   3 +-
 silk_headers.mk                |   4 ++
 silk_sources.mk                |   2 +
 22 files changed, 685 insertions(+), 111 deletions(-)
 create mode 100644 celt/arm/fixed_arm64.h
 create mode 100644 silk/NSQ.h
 create mode 100644 silk/arm/NSQ_neon.c
 create mode 100644 silk/arm/NSQ_neon.h
 create mode 100644 silk/arm/SigProc_FIX_arm64.h
 create mode 100644 silk/arm/macros_arm64.h
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 1/8] Move ARM-specific macro overrides to arm-specific file.
---
 celt/arm/pitch_arm.h | 19 +++++++++++++++++++
 celt/pitch.h         | 19 -------------------
 2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index d5c9408..fe76f8d 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -75,4 +75,23 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const
opus_val16 *_y,
 #endif
 
 #endif /* end !FIXED_POINT */
+
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && \
+  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+extern
+#  if defined(FIXED_POINT)
+opus_val32
+#  else
+void
+#  endif
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int);
+
+#  define OVERRIDE_PITCH_XCORR
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+# endif
+
 #endif
diff --git a/celt/pitch.h b/celt/pitch.h
index dde48c8..bbe790e 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -186,24 +186,6 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16
*_y,
       opus_val32 *xcorr, int len, int max_pitch);
 
 #if !defined(OVERRIDE_PITCH_XCORR)
-/*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD) && \
-  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
-extern
-#  if defined(FIXED_POINT)
-opus_val32
-#  else
-void
-#  endif
-(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
-
-#  define OVERRIDE_PITCH_XCORR
-#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
-# else
-
 #ifdef FIXED_POINT
 opus_val32
 #else
@@ -212,7 +194,6 @@ void
 celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
       opus_val32 *xcorr, int len, int max_pitch, int arch);
 
-# endif
 #endif
 
 #endif
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 2/8] Reorganize pitch_arm.h, so RTCD works for intrinsics functions as well.
---
 celt/arm/arm_celt_map.c | 24 +++++++++++-
 celt/arm/pitch_arm.h    | 97 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 88 insertions(+), 33 deletions(-)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 0c9acff..cc6b706 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -94,9 +94,14 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const
mdct_lookup *l,
    clt_mdct_backward_neon,        /* Neon with NE10 */
 };
 
-#endif
+#endif /* HAVE_ARM_NE10 */
 
 # if defined(FIXED_POINT)
+
+#if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_EDSP)
+
+#if !defined(OPUS_ARM_PRESUME_NEON) && (!defined(OPUS_ARM_PRESUME_EDSP)
|| defined(OPUS_ARM_MAY_HAVE_NEON))
+
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
     const opus_val16 *, opus_val32 *, int , int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
@@ -104,8 +109,23 @@ opus_val32 (*const
CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
   MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
 };
+
+#endif
+
+#elif defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int, int) = {
+  celt_pitch_xcorr_c,              /* ARMv4 */
+  celt_pitch_xcorr_c,              /* EDSP */
+  celt_pitch_xcorr_c,              /* Media */
+  celt_pitch_xcorr_fixed_neon      /* Neon */
+};
+
+#endif
+
 # else /* !FIXED_POINT */
-#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
     const opus_val16 *, opus_val32 *, int, int) = {
   celt_pitch_xcorr_c,              /* ARMv4 */
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index fe76f8d..fa2dd17 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -32,59 +32,89 @@
 
 # if defined(FIXED_POINT)
 
-#if defined(CPU_AARCH64)
-#define OVERRIDE_PITCH_XCORR (1)
-opus_val32 celt_pitch_xcorr_fixed_neon(const opus_val16 *_x, const opus_val16
*_y,
-                                       opus_val32 *xcorr, int len, int
max_pitch);
-#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-   ((void)(arch), celt_pitch_xcorr_fixed_neon(_x, _y, xcorr, len, max_pitch))
-
-#else /* End CPU_AARCH64. Begin CPU_ARM */
+#if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_EDSP)
+/* ARM assembly */
 
-#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+#if defined(OPUS_ARM_MAY_HAVE_NEON)
 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
     opus_val32 *xcorr, int len, int max_pitch);
 #  endif
 
-#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
-#   define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
-#  endif
+#if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
+#endif
 
 #  if defined(OPUS_ARM_MAY_HAVE_EDSP)
 opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
     opus_val32 *xcorr, int len, int max_pitch);
 #  endif
 
-#  if !defined(OPUS_HAVE_RTCD)
-#   define OVERRIDE_PITCH_XCORR (1)
+#if defined(OPUS_ARM_PRESUME_NEON)
+
+#define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
-#  endif
-#endif /* End CPU_ARM */
+   ((void)(arch),celt_pitch_xcorr_neon(_x, _y, xcorr, len, max_pitch))
+
+#elif defined(OPUS_ARM_PRESUME_EDSP) &&
!defined(OPUS_ARM_MAY_HAVE_NEON)
+
+#define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+   ((void)(arch),celt_pitch_xcorr_edsp(_x, _y, xcorr, len, max_pitch))
+
+#elif defined(OPUS_HAVE_RTCD)
+
+extern opus_val32
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int);
+
+#  define OVERRIDE_PITCH_XCORR
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+
+#endif /* End OPUS_HAVE_RTCD */
+
+#elif defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+/* Neon intrinsics */
+
+opus_val32 celt_pitch_xcorr_fixed_neon(const opus_val16 *_x, const opus_val16
*_y,
+                                       opus_val32 *xcorr, int len, int
max_pitch);
+
+#if defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+#define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+   ((void)(arch),celt_pitch_xcorr_fixed_neon(_x, _y, xcorr, len, max_pitch))
+
+#elif defined(OPUS_HAVE_RTCD)
+
+extern opus_val32
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int);
+
+#  define OVERRIDE_PITCH_XCORR
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+
+#endif /* End OPUS_HAVE_RTCD */
+
+#endif /* End OPUS_ARM_MAY_HAVE_NEON_INTR */
 
 #else /* Start !FIXED_POINT */
-/* Float case */
+
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                                  opus_val32 *xcorr, int len, int max_pitch);
 #if defined(OPUS_ARM_PRESUME_NEON_INTR)
+
 #define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
    ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
-#endif
-#endif
 
-#endif /* end !FIXED_POINT */
+#elif defined(OPUS_HAVE_RTCD)
 
-/*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD) && \
-  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
-extern
-#  if defined(FIXED_POINT)
-opus_val32
-#  else
-void
-#  endif
+extern void
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
       const opus_val16 *, opus_val32 *, int, int);
 
@@ -92,6 +122,11 @@ void
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
         xcorr, len, max_pitch))
-# endif
+
+#endif /* End OPUS_HAVE_RTCD */
+
+#endif /* end OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+#endif /* end !FIXED_POINT */
 
 #endif
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 3/8] Autoconf changes for arm64 inline assembly support.
---
 configure.ac | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
diff --git a/configure.ac b/configure.ac
index 019d36d..13ed33f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -199,6 +199,25 @@ cpu_arm=no
 AS_IF([test x"${enable_asm}" = x"yes"],[
     inline_optimization="No inline ASM for your platform, please send
patches"
     case $host_cpu in
+      arm64*|aarch64*)
+      dnl Currently we only have asm for fixed-point
+      AS_IF([test "$enable_float" != "yes"],[
+            cpu_arm64=yes
+            AC_DEFINE([OPUS_ARM64_ASM], [],  [Make use of ARM64 asm
optimization])
+            AS_GCC_INLINE_ASSEMBLY(
+                [inline_optimization="ARM64"],
+                [inline_optimization="disabled"]
+            )
+            AS_IF([test x"$inline_optimization" =
x"ARM64"],[
+                AC_DEFINE([OPUS_ARM64_INLINE_ASM], 1,
+                    [Use ARM64 inline asm optimizations])
+                ])
+                dnl Don't yet have external asm for arm64
+                asm_optimization="disabled"
+                dnl Don't need RTCD for arm64
+                rtcd_support=no
+             ])
+      ;;
       arm*)
         dnl Currently we only have asm for fixed-point
         AS_IF([test "$enable_float" != "yes"],[
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 4/8] Arm64 assembly for Celt fixed-point math.
---
 celt/arch.h            |  2 ++
 celt/arm/fixed_arm64.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 celt_headers.mk        |  1 +
 3 files changed, 78 insertions(+)
 create mode 100644 celt/arm/fixed_arm64.h
diff --git a/celt/arch.h b/celt/arch.h
index 9f74ddd..219569b 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -122,6 +122,8 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
 #include "arm/fixed_armv5e.h"
 #elif defined (OPUS_ARM_INLINE_ASM)
 #include "arm/fixed_armv4.h"
+#elif defined (OPUS_ARM64_INLINE_ASM)
+#include "arm/fixed_arm64.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
 #elif defined (TI_C5X_ASM)
diff --git a/celt/arm/fixed_arm64.h b/celt/arm/fixed_arm64.h
new file mode 100644
index 0000000..7fa413b
--- /dev/null
+++ b/celt/arm/fixed_arm64.h
@@ -0,0 +1,75 @@
+/* Copyright (C) 2014 Vidyo */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARM64_H
+#define FIXED_ARM64_H
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32
bits */
+#undef MULT16_32_Q16
+static OPUS_INLINE opus_val32 MULT16_32_Q16_arm64(opus_val16 a, opus_val32 b)
+{
+  opus_int64 rd;
+  __asm__(
+      "smull %x0, %w1, %w2\n\t"
+      : "=&r"(rd)
+      : "%r"(b), "r"(a<<16)
+  );
+  return (rd >> 32);
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_arm64(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32
bits */
+#undef MULT16_32_Q15
+static OPUS_INLINE opus_val32 MULT16_32_Q15_arm64(opus_val16 a, opus_val32 b)
+{
+  opus_int64 rd;
+  __asm__(
+      "smull %x0, %w1, %w2\n\t"
+      : "=&r"(rd)
+      : "%r"(b), "r"(a<<16)
+  );
+  return ((rd >> 32) << 1);
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_arm64(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
+
+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q16
+#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
+
+/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32
bits */
+#undef MULT32_32_Q31
+#define MULT32_32_Q31(a,b) (opus_val32)((((opus_int64)(a)) *
((opus_int64)(b)))>>31)
+
+#endif
diff --git a/celt_headers.mk b/celt_headers.mk
index 0eca6e6..c9df94b 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -36,6 +36,7 @@ celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
+celt/arm/fixed_arm64.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
 celt/arm/pitch_arm.h \
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 5/8] Arm64 assembly for Silk math.
---
 silk/SigProc_FIX.h           |  4 +++
 silk/arm/SigProc_FIX_arm64.h | 46 ++++++++++++++++++++++++++++++
 silk/arm/macros_arm64.h      | 66 ++++++++++++++++++++++++++++++++++++++++++++
 silk/macros.h                |  4 +++
 silk_headers.mk              |  2 ++
 5 files changed, 122 insertions(+)
 create mode 100644 silk/arm/SigProc_FIX_arm64.h
 create mode 100644 silk/arm/macros_arm64.h
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..0a6969d 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -603,6 +603,10 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a,
opus_int64 b)
 #include "arm/SigProc_FIX_armv5e.h"
 #endif
 
+#ifdef OPUS_ARM64_INLINE_ASM
+#include "arm/SigProc_FIX_arm64.h"
+#endif
+
 #if defined(MIPSr1_ASM)
 #include "mips/sigproc_fix_mipsr1.h"
 #endif
diff --git a/silk/arm/SigProc_FIX_arm64.h b/silk/arm/SigProc_FIX_arm64.h
new file mode 100644
index 0000000..faa5900
--- /dev/null
+++ b/silk/arm/SigProc_FIX_arm64.h
@@ -0,0 +1,46 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_ARM64_H
+#define SILK_SIGPROC_FIX_ARM64_H
+
+#undef silk_MLA
+static OPUS_INLINE opus_int32 silk_MLA_arm64(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int32 res;
+  __asm__(
+      "#silk_MLA\n\t"
+      "madd %w0, %w1, %w2, %w3\n\t"
+      : "=&r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_MLA(a, b, c) (silk_MLA_arm64(a, b, c))
+
+#endif
diff --git a/silk/arm/macros_arm64.h b/silk/arm/macros_arm64.h
new file mode 100644
index 0000000..fe794de
--- /dev/null
+++ b/silk/arm/macros_arm64.h
@@ -0,0 +1,66 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_MACROS_ARM64_H
+#define SILK_MACROS_ARM64_H
+
+/* (a32 * b32) >> 16 */
+#undef silk_SMULWW
+static OPUS_INLINE opus_int32 silk_SMULWW_arm64(opus_int32 a, opus_int32 b)
+{
+  opus_int64 rd;
+  __asm__(
+    "#silk_SMULWW\n\t"
+    "smull %x0, %w1, %w2\n\t"
+    : "=&r"(rd)
+    : "%r"(a), "r"(b)
+  );
+  rd >>= 16;
+  rd &= 0xFFFFFFFF;
+  return rd;
+}
+#define silk_SMULWW(a, b) (silk_SMULWW_arm64(a, b))
+
+#undef silk_SMLAWW
+static OPUS_INLINE opus_int32 silk_SMLAWW_arm64(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int64 rd;
+  __asm__(
+    "#silk_SMLAWW\n\t"
+    "smull %x0, %w1, %w2\n\t"
+    : "=&r"(rd)
+    : "%r"(b), "r"(c)
+  );
+  rd >>= 16;
+  rd &= 0xFFFFFFFF;
+
+  return a + rd;
+}
+#define silk_SMLAWW(a, b, c) (silk_SMLAWW_arm64(a, b, c))
+
+#endif /* SILK_MACROS_ARM64_H */
diff --git a/silk/macros.h b/silk/macros.h
index bc30303..e7ab7f5 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -149,5 +149,9 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32)
 #include "arm/macros_armv5e.h"
 #endif
 
+#ifdef OPUS_ARM64_INLINE_ASM
+#include "arm/macros_arm64.h"
+#endif
+
 #endif /* SILK_MACROS_H */
 
diff --git a/silk_headers.mk b/silk_headers.mk
index 679ff8f..6676133 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -23,8 +23,10 @@ silk/SigProc_FIX.h \
 silk/x86/SigProc_FIX_sse.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
+silk/arm/macros_arm64.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
+silk/arm/SigProc_FIX_arm64.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 6/8] Add Neon intrinsics for Silk noise shape quantization.
---
 Makefile.am         |  8 +++--
 silk/NSQ.c          | 37 ++++++++--------------
 silk/NSQ.h          | 70 +++++++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.c | 64 +++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 silk/x86/NSQ_sse.c  |  2 +-
 silk/x86/main_sse.h |  3 +-
 silk_headers.mk     |  2 ++
 silk_sources.mk     |  2 ++
 9 files changed, 250 insertions(+), 29 deletions(-)
 create mode 100644 silk/NSQ.h
 create mode 100644 silk/arm/NSQ_neon.c
 create mode 100644 silk/arm/NSQ_neon.h
diff --git a/Makefile.am b/Makefile.am
index a7e7465..d819f38 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -49,6 +49,7 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
 
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
+SILK_SOURCES += $(SILK_SOURCES_ARM_NEON_INTR)
 endif
 
 if HAVE_ARM_NE10
@@ -299,7 +300,8 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_X86_SSE4_1_CFLAGS)
 endif
 
 if HAVE_ARM_NEON_INTR
-CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
-                         $(CELT_SOURCES_ARM_NE10:.c=.lo)
-$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS)
+ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(CELT_SOURCES_ARM_NE10:.c=.lo)
+$(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS)
 endif
diff --git a/silk/NSQ.c b/silk/NSQ.c
index a065884..d8513dc 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -31,6 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 #include "stack_alloc.h"
+#include "NSQ.h"
+
 
 static OPUS_INLINE void silk_nsq_scale_states(
     const silk_encoder_state *psEncC,           /* I    Encoder State          
*/
@@ -66,7 +68,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 );
 #endif
 
@@ -155,7 +158,7 @@ void silk_NSQ_c
 
         silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10,
pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
             AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[
k ], Gains_Q16[ k ], Lambda_Q10,
-            offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder );
+            offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->arch );
 
         x_Q3   += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
@@ -198,7 +201,8 @@ void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 )
 {
     opus_int     i, j;
@@ -207,6 +211,9 @@ void silk_noise_shape_quantizer(
     opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
+#ifdef OPUS_ARM_MAY_HAVE_NEON_INTR
+    opus_int32   a_Q12_rev[16];
+#endif
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag +
HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -215,32 +222,14 @@ void silk_noise_shape_quantizer(
     /* Set up short term AR state */
     psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
 
+    optional_coef_reversal(a_Q12_rev, a_Q12, predictLPCOrder);
+
     for( i = 0; i < length; i++ ) {
         /* Generate dither */
         NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
 
         /* Short-term prediction */
-        silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
-        /* Avoids introducing a bias because silk_SMLAWB() always rounds to
-inf */
-        LPC_pred_Q10 = silk_RSHIFT( predictLPCOrder, 1 );
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[  0 ], a_Q12[ 0 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -1 ], a_Q12[ 1 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -2 ], a_Q12[ 2 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -3 ], a_Q12[ 3 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -4 ], a_Q12[ 4 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -5 ], a_Q12[ 5 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -6 ], a_Q12[ 6 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -7 ], a_Q12[ 7 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -8 ], a_Q12[ 8 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -9 ], a_Q12[ 9 ]
);
-        if( predictLPCOrder == 16 ) {
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -10 ], a_Q12[
10 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -11 ], a_Q12[
11 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -12 ], a_Q12[
12 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -13 ], a_Q12[
13 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -14 ], a_Q12[
14 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -15 ], a_Q12[
15 ] );
-        }
+        LPC_pred_Q10 = silk_noise_shape_quantizer_short_prediction(psLPC_Q14,
a_Q12, a_Q12_rev, predictLPCOrder, arch);
 
         /* Long-term prediction */
         if( signalType == TYPE_VOICED ) {
diff --git a/silk/NSQ.h b/silk/NSQ.h
new file mode 100644
index 0000000..a18a951
--- /dev/null
+++ b/silk/NSQ.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+Copyright (c) 2014 Vidyo.
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef SILK_NSQ_H
+#define SILK_NSQ_H
+
+#define optional_coef_reversal(out, in, order)
+
+static OPUS_INLINE opus_int32
silk_noise_shape_quantizer_short_prediction_c(const opus_int32 *buf32, const
opus_int16 *coef16, opus_int order)
+{
+    opus_int32 out;
+    silk_assert( order == 10 || order == 16 );
+
+    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+    out = silk_RSHIFT( order, 1 );
+    out = silk_SMLAWB( out, buf32[  0 ], coef16[ 0 ] );
+    out = silk_SMLAWB( out, buf32[ -1 ], coef16[ 1 ] );
+    out = silk_SMLAWB( out, buf32[ -2 ], coef16[ 2 ] );
+    out = silk_SMLAWB( out, buf32[ -3 ], coef16[ 3 ] );
+    out = silk_SMLAWB( out, buf32[ -4 ], coef16[ 4 ] );
+    out = silk_SMLAWB( out, buf32[ -5 ], coef16[ 5 ] );
+    out = silk_SMLAWB( out, buf32[ -6 ], coef16[ 6 ] );
+    out = silk_SMLAWB( out, buf32[ -7 ], coef16[ 7 ] );
+    out = silk_SMLAWB( out, buf32[ -8 ], coef16[ 8 ] );
+    out = silk_SMLAWB( out, buf32[ -9 ], coef16[ 9 ] );
+
+    if( order == 16 )
+    {
+        out = silk_SMLAWB( out, buf32[ -10 ], coef16[ 10 ] );
+        out = silk_SMLAWB( out, buf32[ -11 ], coef16[ 11 ] );
+        out = silk_SMLAWB( out, buf32[ -12 ], coef16[ 12 ] );
+        out = silk_SMLAWB( out, buf32[ -13 ], coef16[ 13 ] );
+        out = silk_SMLAWB( out, buf32[ -14 ], coef16[ 14 ] );
+        out = silk_SMLAWB( out, buf32[ -15 ], coef16[ 15 ] );
+    }
+    return out;
+}
+
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_c(in, coef,
order))
+
+
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#include "arm/NSQ_neon.h"
+#endif
+
+#endif /* SILK_NSQ_H */
diff --git a/silk/arm/NSQ_neon.c b/silk/arm/NSQ_neon.c
new file mode 100644
index 0000000..96b672d
--- /dev/null
+++ b/silk/arm/NSQ_neon.c
@@ -0,0 +1,64 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "main.h"
+#include "stack_alloc.h"
+#include "NSQ.h"
+#include "celt/cpu_support.h"
+#include "celt/arm/armcpu.h"
+
+opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32)
+{   /* NEON short-term prediction: 16-tap dot product of buf32[ -15..0 ] with coef32[ 0..15 ], accumulated in 64 bits and shifted right by 16. */
+    int32x4_t coef0 = vld1q_s32(coef32);        /* coef32[ 0..3 ] */
+    int32x4_t coef1 = vld1q_s32(coef32 + 4);    /* coef32[ 4..7 ] */
+    int32x4_t coef2 = vld1q_s32(coef32 + 8);    /* coef32[ 8..11 ] */
+    int32x4_t coef3 = vld1q_s32(coef32 + 12);   /* coef32[ 12..15 ] */
+    /* All 16 taps are always read; coef32[ 15 ] pairs with buf32[ 0 ], i.e. the layout written by optional_coef_reversal_neon() (zero-filled for order 10). */
+    int32x4_t a0 = vld1q_s32(buf32 - 15);       /* buf32[ -15..-12 ] */
+    int32x4_t a1 = vld1q_s32(buf32 - 11);       /* buf32[ -11..-8 ] */
+    int32x4_t a2 = vld1q_s32(buf32 - 7);        /* buf32[ -7..-4 ] */
+    int32x4_t a3 = vld1q_s32(buf32 - 3);        /* buf32[ -3..0 ] */
+    /* Widening 32x32->64 multiply-accumulate, two lanes at a time. */
+    int64x2_t b0 = vmull_s32(vget_low_s32(a0), vget_low_s32(coef0));
+    int64x2_t b1 = vmlal_s32(b0, vget_high_s32(a0), vget_high_s32(coef0));
+    int64x2_t b2 = vmlal_s32(b1, vget_low_s32(a1), vget_low_s32(coef1));
+    int64x2_t b3 = vmlal_s32(b2, vget_high_s32(a1), vget_high_s32(coef1));
+    int64x2_t b4 = vmlal_s32(b3, vget_low_s32(a2), vget_low_s32(coef2));
+    int64x2_t b5 = vmlal_s32(b4, vget_high_s32(a2), vget_high_s32(coef2));
+    int64x2_t b6 = vmlal_s32(b5, vget_low_s32(a3), vget_low_s32(coef3));
+    int64x2_t b7 = vmlal_s32(b6, vget_high_s32(a3), vget_high_s32(coef3));
+    /* NOTE(review): unlike the _c version, no order>>1 bias is added and the >>16 is applied once to the total rather than per silk_SMLAWB() step, so results may differ from the C path -- confirm this is acceptable. */
+    int64x1_t c = vadd_s64(vget_low_s64(b7), vget_high_s64(b7));   /* horizontal add of the two 64-bit lanes */
+    int64x1_t cS = vshr_n_s64(c, 16);
+    int32x2_t d = vreinterpret_s32_s64(cS);
+    opus_int32 out = vget_lane_s32(d, 0);   /* keep lane 0 of the reinterpreted 64-bit result */
+    return out;
+}
diff --git a/silk/arm/NSQ_neon.h b/silk/arm/NSQ_neon.h
new file mode 100644
index 0000000..8e67cb9
--- /dev/null
+++ b/silk/arm/NSQ_neon.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef SILK_NSQ_NEON_H
+#define SILK_NSQ_NEON_H
+
+#undef optional_coef_reversal /* remove any earlier definition; it is re-defined further down only when one of the two #if branches applies */
+/* Reverse the 16-bit a_Q12 coefficients and widen them to 32 bits (out[ 15 - k ] = in[ k ]) to make the vector calculation easier. */
+static OPUS_INLINE void optional_coef_reversal_neon(opus_int32 *out, const
opus_int16 *in, opus_int order)
+{
+    out[15] = in[0];   /* the first ten coefficients are copied for every order */
+    out[14] = in[1];
+    out[13] = in[2];
+    out[12] = in[3];
+    out[11] = in[4];
+    out[10] = in[5];
+    out[9]  = in[6];
+    out[8]  = in[7];
+    out[7]  = in[8];
+    out[6]  = in[9];
+
+    if (order == 16)   /* remaining six coefficients exist only for order 16 */
+    {
+        out[5] = in[10];
+        out[4] = in[11];
+        out[3] = in[12];
+        out[2] = in[13];
+        out[1] = in[14];
+        out[0] = in[15];
+    }
+    else   /* any other order: in[ 10..15 ] are not read and the six leading slots are zero-filled, so out always holds 16 valid entries */
+    {
+        out[5] = 0;
+        out[4] = 0;
+        out[3] = 0;
+        out[2] = 0;
+        out[1] = 0;
+        out[0] = 0;
+    }
+}
+
+#if OPUS_ARM_PRESUME_NEON_INTR /* first branch: the reversed table is built unconditionally */
+
+#define optional_coef_reversal(out, in, order)
(optional_coef_reversal_neon(out, in, order))
+
+#elif OPUS_HAVE_RTCD /* second branch: the reversed table is built only when arch == 3 */
+/* NOTE(review): 'arch' is not a macro parameter, so a variable of that name must be in scope at the expansion site; 3 is presumably the NEON arch index -- confirm and prefer a named constant. */
+#define optional_coef_reversal(out, in, order) do { if (arch == 3) {
optional_coef_reversal_neon(out, in, order); } } while (0)
+
+#endif /* NOTE(review): if neither condition holds, optional_coef_reversal is left undefined after the #undef in this header -- confirm that configuration cannot occur */
+
+opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32);
+/* Dispatch for silk_noise_shape_quantizer_short_prediction(): replaces whatever definition was active before this point. */
+#if OPUS_ARM_PRESUME_NEON_INTR
+#undef silk_noise_shape_quantizer_short_prediction
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_neon(in,
coefRev)) /* always NEON: coef and order do not appear in the expansion, arch is evaluated and discarded */
+
+#elif OPUS_HAVE_RTCD
+
+/* The silk_noise_shape_quantizer_short_prediction implementations take different
parameters depending on arch (coef for the C version, coefRev for the NEON version),
+   so the usual IMPL function table cannot be used; a conditional expression selects the call instead. */
+#undef silk_noise_shape_quantizer_short_prediction
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  (arch == 3 ? silk_noise_shape_quantizer_short_prediction_neon(in,
coefRev) : silk_noise_shape_quantizer_short_prediction_c(in, coef, order)) /* NOTE(review): arch is unparenthesized and compared with the literal 3 (presumably the NEON index) -- confirm */
+
+
+#endif
+
+#endif /* SILK_NSQ_NEON_H */
diff --git a/silk/x86/NSQ_sse.c b/silk/x86/NSQ_sse.c
index 72f34fd..bb3c5f1 100644
--- a/silk/x86/NSQ_sse.c
+++ b/silk/x86/NSQ_sse.c
@@ -221,7 +221,7 @@ void silk_NSQ_sse4_1(
         {
             silk_noise_shape_quantizer( NSQ, psIndices->signalType,
x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ],
LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
-                offset_Q10, psEncC->subfr_length,
psEncC->shapingLPCOrder, psEncC->predictLPCOrder );
+                offset_Q10, psEncC->subfr_length,
psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
         }
 
         x_Q3   += psEncC->subfr_length;
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index afd5ec2..d8d6131 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -207,7 +207,8 @@ void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 );
 
 /**************************/
diff --git a/silk_headers.mk b/silk_headers.mk
index 6676133..731f1ed 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -15,6 +15,7 @@ silk/Inlines.h \
 silk/MacroCount.h \
 silk/MacroDebug.h \
 silk/macros.h \
+silk/NSQ.h \
 silk/pitch_est_defines.h \
 silk/resampler_private.h \
 silk/resampler_rom.h \
@@ -27,6 +28,7 @@ silk/arm/macros_arm64.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
 silk/arm/SigProc_FIX_arm64.h \
+silk/arm/NSQ_neon.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7cfb7d3..79ac6f0 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -82,6 +82,8 @@ silk/x86/x86_silk_map.c \
 silk/x86/VAD_sse.c \
 silk/x86/VQ_WMat_EC_sse.c
 
+SILK_SOURCES_ARM_NEON_INTR = silk/arm/NSQ_neon.c
+
 SILK_SOURCES_FIXED = \
 silk/fixed/LTP_analysis_filter_FIX.c \
 silk/fixed/LTP_scale_ctrl_FIX.c \
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 7/8] Add Neon intrinsics for Silk noise shape feedback loop.
---
 silk/NSQ.c          | 18 ++-------------
 silk/NSQ.h          | 27 ++++++++++++++++++++++
 silk/arm/NSQ_neon.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h | 10 ++++++++
 4 files changed, 105 insertions(+), 16 deletions(-)
diff --git a/silk/NSQ.c b/silk/NSQ.c
index d8513dc..ec81f3b 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -205,7 +205,7 @@ void silk_noise_shape_quantizer(
     int                 arch                    /* I    Architecture           
*/
 )
 {
-    opus_int     i, j;
+    opus_int     i;
     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
     opus_int32   n_LF_Q12, r_Q10, rr_Q10, q1_Q0, q1_Q10, q2_Q10, rd1_Q20,
rd2_Q20;
     opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
@@ -248,21 +248,7 @@ void silk_noise_shape_quantizer(
 
         /* Noise shape feedback */
         silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order
is even */
-        tmp2 = psLPC_Q14[ 0 ];
-        tmp1 = NSQ->sAR2_Q14[ 0 ];
-        NSQ->sAR2_Q14[ 0 ] = tmp2;
-        n_AR_Q12 = silk_RSHIFT( shapingLPCOrder, 1 );
-        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp2, AR_shp_Q13[ 0 ] );
-        for( j = 2; j < shapingLPCOrder; j += 2 ) {
-            tmp2 = NSQ->sAR2_Q14[ j - 1 ];
-            NSQ->sAR2_Q14[ j - 1 ] = tmp1;
-            n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp1, AR_shp_Q13[ j - 1 ] );
-            tmp1 = NSQ->sAR2_Q14[ j + 0 ];
-            NSQ->sAR2_Q14[ j + 0 ] = tmp2;
-            n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp2, AR_shp_Q13[ j ] );
-        }
-        NSQ->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
-        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp1, AR_shp_Q13[ shapingLPCOrder - 1
] );
+        n_AR_Q12 = silk_NSQ_noise_shape_feedback_loop(psLPC_Q14,
NSQ->sAR2_Q14, AR_shp_Q13, shapingLPCOrder, arch);
 
         n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                               
/* Q11 -> Q12 */
         n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sLF_AR_shp_Q14, Tilt_Q14 );
diff --git a/silk/NSQ.h b/silk/NSQ.h
index a18a951..df856e6 100644
--- a/silk/NSQ.h
+++ b/silk/NSQ.h
@@ -62,6 +62,33 @@ static OPUS_INLINE opus_int32
silk_noise_shape_quantizer_short_prediction_c(cons
 
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_c(in, coef,
order))
 
+static OPUS_INLINE opus_int32 silk_NSQ_noise_shape_feedback_loop_c(const
opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef, opus_int order)
+{   /* Portable C noise-shape feedback: shifts data1[] up by one slot (data0[0] enters at index 0) and returns the silk_SMLAWB() sum of those values with coef[ 0..order-1 ]. */
+    opus_int32 out;
+    opus_int32 tmp1, tmp2;   /* hold the two values in flight while the state is shifted in place */
+    opus_int j;
+
+    tmp2 = data0[0];
+    tmp1 = data1[0];   /* save the old data1[0] before it is overwritten */
+    data1[0] = tmp2;
+
+    out = silk_RSHIFT(order, 1);   /* start from order/2, the same bias the short-prediction code uses to offset silk_SMLAWB() rounding */
+    out = silk_SMLAWB(out, tmp2, coef[0]);
+
+    for (j = 2; j < order; j += 2) {   /* two taps per iteration; presumably relies on order being even, as the caller asserts -- confirm */
+        tmp2 = data1[j - 1];
+        data1[j - 1] = tmp1;   /* old data1[j - 2] moves to slot j - 1 */
+        out = silk_SMLAWB(out, tmp1, coef[j - 1]);
+        tmp1 = data1[j + 0];
+        data1[j + 0] = tmp2;   /* old data1[j - 1] moves to slot j */
+        out = silk_SMLAWB(out, tmp2, coef[j]);
+    }
+    data1[order - 1] = tmp1;   /* last slot receives the remaining value in flight */
+    out = silk_SMLAWB(out, tmp1, coef[order - 1]);
+    return out;
+}
+
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
((void)arch,silk_NSQ_noise_shape_feedback_loop_c(data0, data1, coef, order)) /* Default dispatch: always the C version; arch is evaluated and discarded. */
 
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #include "arm/NSQ_neon.h"
diff --git a/silk/arm/NSQ_neon.c b/silk/arm/NSQ_neon.c
index 96b672d..fb858f3 100644
--- a/silk/arm/NSQ_neon.c
+++ b/silk/arm/NSQ_neon.c
@@ -62,3 +62,69 @@ opus_int32
silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *bu
     opus_int32 out = vget_lane_s32(d, 0);
     return out;
 }
+
+
+opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0,
opus_int32 *data1, const opus_int16 *coef, opus_int order)
+{   /* NEON noise-shape feedback: vectorized only for order == 8; every other order runs the scalar loop in the else branch. */
+    opus_int32 out;
+    if (order == 8)
+    {
+        int32x4_t a00 = vdupq_n_s32(data0[0]);
+        int32x4_t a01 = vld1q_s32(data1);  /* data1[0] ... [3] */
+
+        int32x4_t a0 = vextq_s32 (a00, a01, 3); /* data0[0] data1[0] ...[2] */
+        int32x4_t a1 = vld1q_s32(data1 + 3);  /* data1[3] ... [6] */
+
+        int16x8_t coef16 = vld1q_s16(coef);   /* coef[0] ... [7] */
+        int32x4_t coef0 = vmovl_s16(vget_low_s16(coef16));    /* widen coef[0..3] to 32 bits */
+        int32x4_t coef1 = vmovl_s16(vget_high_s16(coef16));   /* widen coef[4..7] to 32 bits */
+
+        int64x2_t b0 = vmull_s32(vget_low_s32(a0), vget_low_s32(coef0));
+        int64x2_t b1 = vmlal_s32(b0, vget_high_s32(a0), vget_high_s32(coef0));
+        int64x2_t b2 = vmlal_s32(b1, vget_low_s32(a1), vget_low_s32(coef1));
+        int64x2_t b3 = vmlal_s32(b2, vget_high_s32(a1), vget_high_s32(coef1));
+        /* NOTE(review): this path adds no order>>1 bias and shifts by 16 once after summing, whereas the scalar branch below biases and goes through silk_SMLAWB() per tap -- results may differ; confirm. */
+        int64x1_t c = vadd_s64(vget_low_s64(b3), vget_high_s64(b3));   /* horizontal add of the two 64-bit lanes */
+        int64x1_t cS = vshr_n_s64(c, 16);
+        int32x2_t d = vreinterpret_s32_s64(cS);
+
+        out = vget_lane_s32(d, 0);   /* keep lane 0 of the reinterpreted 64-bit result */
+        vst1q_s32(data1, a0);       /* new data1[0..3] = data0[0], old data1[0..2] */
+        vst1q_s32(data1 + 4, a1);   /* new data1[4..7] = old data1[3..6] */
+    }
+    else   /* scalar fallback: same statements as silk_NSQ_noise_shape_feedback_loop_c() */
+    {
+        opus_int32 tmp1, tmp2;
+        opus_int j;
+
+        tmp2 = data0[0];
+        tmp1 = data1[0];
+        data1[0] = tmp2;
+
+        out = silk_RSHIFT(order, 1);
+        out = silk_SMLAWB(out, tmp2, coef[0]);
+
+        for (j = 2; j < order; j += 2) {   /* shift the state in place, two taps per iteration */
+            tmp2 = data1[j - 1];
+            data1[j - 1] = tmp1;
+            out = silk_SMLAWB(out, tmp1, coef[j - 1]);
+            tmp1 = data1[j + 0];
+            data1[j + 0] = tmp2;
+            out = silk_SMLAWB(out, tmp2, coef[j]);
+        }
+        data1[order - 1] = tmp1;
+        out = silk_SMLAWB(out, tmp1, coef[order - 1]);
+    }
+    return out;
+}
+
+#if !defined(OPUS_ARM_PRESUME_NEON_INTR) && defined(OPUS_HAVE_RTCD)
+/* Run-time dispatch table for the feedback loop, indexed by arch; NOTE(review): NSQ_neon.h tests these macros with #if / #elif rather than defined() -- confirm the two agree. */
+opus_int32 (*const
SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[OPUS_ARCHMASK+1])(const opus_int32
*data0, opus_int32 *data1, const opus_int16 *coef, opus_int order) = {
+    silk_NSQ_noise_shape_feedback_loop_c,      /* arch index 0 */
+    silk_NSQ_noise_shape_feedback_loop_c,      /* arch index 1 */
+    silk_NSQ_noise_shape_feedback_loop_c,      /* arch index 2 */
+    silk_NSQ_noise_shape_feedback_loop_neon,   /* arch index 3: the only slot that uses the NEON version */
+};
+
+#endif
diff --git a/silk/arm/NSQ_neon.h b/silk/arm/NSQ_neon.h
index 8e67cb9..24db2a6 100644
--- a/silk/arm/NSQ_neon.h
+++ b/silk/arm/NSQ_neon.h
@@ -74,10 +74,15 @@ static OPUS_INLINE void
optional_coef_reversal_neon(opus_int32 *out, const opus_
 
 opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32);
 
+opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0,
opus_int32 *data1, const opus_int16 *coef, opus_int order);
+
 #if OPUS_ARM_PRESUME_NEON_INTR
 #undef silk_noise_shape_quantizer_short_prediction
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_neon(in,
coefRev))
 
+#undef silk_NSQ_noise_shape_feedback_loop /* replace the earlier definition of this dispatch macro */
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
((void)arch,silk_NSQ_noise_shape_feedback_loop_neon(data0, data1, coef, order)) /* always the NEON version; arch is evaluated and discarded */
+
 #elif OPUS_HAVE_RTCD
 
 /* silk_noise_shape_quantizer_short_prediction implementations take different
parameters based on arch
@@ -85,6 +90,11 @@ opus_int32
silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *bu
 #undef silk_noise_shape_quantizer_short_prediction
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  (arch == 3 ? silk_noise_shape_quantizer_short_prediction_neon(in,
coefRev) : silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
 
+extern opus_int32 (*const
SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[OPUS_ARCHMASK+1])(const opus_int32
*data0, opus_int32 *data1, const opus_int16 *coef, opus_int order);
+/* Run-time dispatch: the table above is indexed with arch masked by OPUS_ARCHMASK. */
+#undef silk_NSQ_noise_shape_feedback_loop
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
(SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[(arch)&OPUS_ARCHMASK](data0,
data1, coef, order)) /* all four implementations share one signature, so a function table works here */
+
 
 #endif
 
-- 
2.3.2 (Apple Git-55)
Jonathan Lennox
2015-Aug-05  18:17 UTC
[opus] [PATCH 8/8] Apply Neon short prediction optimization to silk_noise_shape_quantizer_del_dec.
---
 silk/NSQ_del_dec.c             | 37 +++++++++++++------------------------
 silk/mips/NSQ_del_dec_mipsr1.h |  3 ++-
 2 files changed, 15 insertions(+), 25 deletions(-)
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index aff560c..aaa1fca 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -31,6 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 #include "stack_alloc.h"
+#include "NSQ.h"
+
 
 typedef struct {
     opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
@@ -106,7 +108,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 );
 
 void silk_NSQ_del_dec_c(
@@ -260,7 +263,7 @@ void silk_NSQ_del_dec_c(
         silk_noise_shape_quantizer_del_dec( NSQ, psDelDec,
psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
             delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag,
HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length,
subfr++, psEncC->shapingLPCOrder,
-            psEncC->predictLPCOrder, psEncC->warping_Q16,
psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
+            psEncC->predictLPCOrder, psEncC->warping_Q16,
psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay,
psEncC->arch );
 
         x_Q3   += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
@@ -333,7 +336,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 )
 {
     opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
@@ -343,6 +347,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+#ifdef OPUS_ARM_MAY_HAVE_NEON_INTR
+    opus_int32   a_Q12_rev[16];
+#endif
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
     NSQ_sample_struct  *psSS;
@@ -355,6 +362,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
 
+    optional_coef_reversal(a_Q12_rev, a_Q12, predictLPCOrder);
+
     for( i = 0; i < length; i++ ) {
         /* Perform common calculations used in all states */
 
@@ -398,27 +407,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
             /* Pointer used in short term prediction and shaping */
             psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
             /* Short-term prediction */
-            silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
-            /* Avoids introducing a bias because silk_SMLAWB() always rounds to
-inf */
-            LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[  0 ], a_Q12[ 0
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -1 ], a_Q12[ 1
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -2 ], a_Q12[ 2
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -3 ], a_Q12[ 3
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -4 ], a_Q12[ 4
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -5 ], a_Q12[ 5
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -6 ], a_Q12[ 6
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -7 ], a_Q12[ 7
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9
] );
-            if( predictLPCOrder == 16 ) {
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -10 ],
a_Q12[ 10 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -11 ],
a_Q12[ 11 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -12 ],
a_Q12[ 12 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -13 ],
a_Q12[ 13 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -14 ],
a_Q12[ 14 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -15 ],
a_Q12[ 15 ] );
-            }
+            LPC_pred_Q14 =
silk_noise_shape_quantizer_short_prediction(psLPC_Q14, a_Q12, a_Q12_rev,
predictLPCOrder, arch);
             LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 );                     
/* Q10 -> Q14 */
 
             /* Noise shape feedback */
diff --git a/silk/mips/NSQ_del_dec_mipsr1.h b/silk/mips/NSQ_del_dec_mipsr1.h
index f6afd92..88e281b 100644
--- a/silk/mips/NSQ_del_dec_mipsr1.h
+++ b/silk/mips/NSQ_del_dec_mipsr1.h
@@ -62,7 +62,8 @@ static inline void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 )
 {
     opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
-- 
2.3.2 (Apple Git-55)
Maybe Matching Threads
- [Aarch64 v2 07/18] Apply Neon short prediction optimization to silk_noise_shape_quantizer_del_dec.
- [PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
- [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset
- [PATCH 6/8] Add Neon intrinsics for Silk noise shape quantization.
- [Aarch64 v2 05/18] Add Neon intrinsics for Silk noise shape quantization.