thr3ads.net - opus - [opus] [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset [Dec 2015]

If this information is useful, please help other people find it:
Share via:

Jonathan Lennox

2015-Dec-23 00:21 UTC

[opus] [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset

Following Tim's comments, here are my reworked versions of the Neon
intrinsic function patches of my Aarch64 patchset, i.e. replacing patches 5-8
of the v2 series.  Patches 1-4 and 9-18 of the old series still apply
unmodified.

The one new (as opposed to changed) patch is the first one in this series, to
add named constants
for the ARM architecture variants.

There are also some minor code style cleanups, notably the removal of C++-style
comments.

Jonathan Lennox (5):
  Add named constants for ARM architecture variants.
  Add Neon intrinsics for Silk noise shape quantization.
  Apply Neon short prediction optimization to
    silk_noise_shape_quantizer_del_dec.
  Add Neon intrinsics for Silk noise shape feedback loop.
  Add Neon fixed-point implementation of xcorr_kernel.

 Makefile.am                    |   5 +-
 celt/arm/arm_celt_map.c        |  17 ++++++
 celt/arm/armcpu.c              |  35 +++++++----
 celt/arm/armcpu.h              |   6 ++
 celt/arm/celt_neon_intr.c      |  61 ++++++++++++++++++-
 celt/arm/pitch_arm.h           |  31 +++++++++-
 silk/NSQ.c                     |  57 ++++++-----------
 silk/NSQ.h                     |  97 +++++++++++++++++++++++++++++
 silk/NSQ_del_dec.c             |  40 +++++-------
 silk/arm/NSQ_neon.c            | 135 +++++++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h            | 109 +++++++++++++++++++++++++++++++++
 silk/mips/NSQ_del_dec_mipsr1.h |   3 +-
 silk/x86/NSQ_sse.c             |   2 +-
 silk/x86/main_sse.h            |   3 +-
 silk_headers.mk                |   2 +
 silk_sources.mk                |   2 +
 16 files changed, 521 insertions(+), 84 deletions(-)
 create mode 100644 silk/NSQ.h
 create mode 100644 silk/arm/NSQ_neon.c
 create mode 100644 silk/arm/NSQ_neon.h

-- 
2.5.4 (Apple Git-61)

Jonathan Lennox

2015-Dec-23 00:21 UTC

head link

[opus] [AArch64 neon intrinsics v4 1/5] Add named constants for ARM architecture variants.

---
 celt/arm/armcpu.c | 35 ++++++++++++++++++++++-------------
 celt/arm/armcpu.h |  6 ++++++
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index 5e5d10c..09fbc41 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -37,11 +37,12 @@
 #include "cpu_support.h"
 #include "os_support.h"
 #include "opus_types.h"
+#include "arch.h"
 
-#define OPUS_CPU_ARM_V4    (1)
-#define OPUS_CPU_ARM_EDSP  (1<<1)
-#define OPUS_CPU_ARM_MEDIA (1<<2)
-#define OPUS_CPU_ARM_NEON  (1<<3)
+#define OPUS_CPU_ARM_V4_FLAG    (1<<OPUS_ARCH_ARM_V4)
+#define OPUS_CPU_ARM_EDSP_FLAG  (1<<OPUS_ARCH_ARM_EDSP)
+#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
+#define OPUS_CPU_ARM_NEON_FLAG  (1<<OPUS_ARCH_ARM_NEON)
 
 #if defined(_MSC_VER)
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -59,7 +60,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
   __try{
     /*PLD [r13]*/
     __emit(0xF5DDF000);
-    flags|=OPUS_CPU_ARM_EDSP;
+    flags|=OPUS_CPU_ARM_EDSP_FLAG;
   }
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
@@ -68,7 +69,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
   __try{
     /*SHADD8 r3,r3,r3*/
     __emit(0xE6333F93);
-    flags|=OPUS_CPU_ARM_MEDIA;
+    flags|=OPUS_CPU_ARM_MEDIA_FLAG;
   }
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
@@ -77,7 +78,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
   __try{
     /*VORR q0,q0,q0*/
     __emit(0xF2200150);
-    flags|=OPUS_CPU_ARM_NEON;
+    flags|=OPUS_CPU_ARM_NEON_FLAG;
   }
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
@@ -115,13 +116,13 @@ opus_uint32 opus_cpu_capabilities(void)
 #  if defined(OPUS_ARM_MAY_HAVE_EDSP)
         p = strstr(buf, " edsp");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
-          flags |= OPUS_CPU_ARM_EDSP;
+          flags |= OPUS_CPU_ARM_EDSP_FLAG;
 #  endif
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
-          flags |= OPUS_CPU_ARM_NEON;
+          flags |= OPUS_CPU_ARM_NEON_FLAG;
 #  endif
       }
 # endif
@@ -134,7 +135,7 @@ opus_uint32 opus_cpu_capabilities(void)
         version = atoi(buf+17);
 
         if(version >= 6)
-          flags |= OPUS_CPU_ARM_MEDIA;
+          flags |= OPUS_CPU_ARM_MEDIA_FLAG;
       }
 # endif
     }
@@ -156,18 +157,26 @@ int opus_select_arch(void)
   opus_uint32 flags = opus_cpu_capabilities();
   int arch = 0;
 
-  if(!(flags & OPUS_CPU_ARM_EDSP))
+  if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
+    /* Asserts ensure arch values are sequential */
+    celt_assert(arch == OPUS_ARCH_ARM_V4);
     return arch;
+  }
   arch++;
 
-  if(!(flags & OPUS_CPU_ARM_MEDIA))
+  if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_EDSP);
     return arch;
+  }
   arch++;
 
-  if(!(flags & OPUS_CPU_ARM_NEON))
+  if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
     return arch;
+  }
   arch++;
 
+  celt_assert(arch == OPUS_ARCH_ARM_NEON);
   return arch;
 }
 
diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h
index ac57446..820262f 100644
--- a/celt/arm/armcpu.h
+++ b/celt/arm/armcpu.h
@@ -66,6 +66,12 @@
 
 # if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);
+
+#define OPUS_ARCH_ARM_V4    (0)
+#define OPUS_ARCH_ARM_EDSP  (1)
+#define OPUS_ARCH_ARM_MEDIA (2)
+#define OPUS_ARCH_ARM_NEON  (3)
+
 # endif
 
 #endif
-- 
2.5.4 (Apple Git-61)

Jonathan Lennox

2015-Dec-23 00:21 UTC

head link

[opus] [AArch64 neon intrinsics v4 2/5] Add Neon intrinsics for Silk noise shape quantization.

---
 Makefile.am         |  5 +--
 silk/NSQ.c          | 39 ++++++++-------------
 silk/NSQ.h          | 70 +++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.c | 69 +++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 silk/x86/NSQ_sse.c  |  2 +-
 silk/x86/main_sse.h |  3 +-
 silk_headers.mk     |  2 ++
 silk_sources.mk     |  2 ++
 9 files changed, 263 insertions(+), 28 deletions(-)
 create mode 100644 silk/NSQ.h
 create mode 100644 silk/arm/NSQ_neon.c
 create mode 100644 silk/arm/NSQ_neon.h

diff --git a/Makefile.am b/Makefile.am
index d256b45..36762c2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -49,6 +49,7 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
 
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
+SILK_SOURCES += $(SILK_SOURCES_ARM_NEON_INTR)
 endif
 
 if HAVE_ARM_NE10
@@ -295,7 +296,7 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_X86_SSE4_1_CFLAGS)
 endif
 
 if HAVE_ARM_NEON_INTR
-CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo)
-$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
+ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo)
$(SILK_SOURCES_ARM_NEON_INTR:.c=.lo)
+$(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
  $(OPUS_ARM_NEON_INTR_CFLAGS)  $(NE10_CFLAGS)
 endif
diff --git a/silk/NSQ.c b/silk/NSQ.c
index a065884..eff0224 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -31,6 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 #include "stack_alloc.h"
+#include "NSQ.h"
+
 
 static OPUS_INLINE void silk_nsq_scale_states(
     const silk_encoder_state *psEncC,           /* I    Encoder State          
*/
@@ -66,7 +68,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 );
 #endif
 
@@ -155,7 +158,7 @@ void silk_NSQ_c
 
         silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10,
pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
             AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[
k ], Gains_Q16[ k ], Lambda_Q10,
-            offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder );
+            offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->arch );
 
         x_Q3   += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
@@ -198,7 +201,8 @@ void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 )
 {
     opus_int     i, j;
@@ -207,6 +211,9 @@ void silk_noise_shape_quantizer(
     opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
+#ifdef silk_short_prediction_create_arch_coef
+    opus_int32   a_Q12_arch[MAX_LPC_ORDER];
+#endif
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag +
HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -215,32 +222,16 @@ void silk_noise_shape_quantizer(
     /* Set up short term AR state */
     psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
 
+#ifdef silk_short_prediction_create_arch_coef
+    silk_short_prediction_create_arch_coef(a_Q12_arch, a_Q12, predictLPCOrder);
+#endif
+
     for( i = 0; i < length; i++ ) {
         /* Generate dither */
         NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
 
         /* Short-term prediction */
-        silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
-        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
-        LPC_pred_Q10 = silk_RSHIFT( predictLPCOrder, 1 );
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[  0 ], a_Q12[ 0 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -1 ], a_Q12[ 1 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -2 ], a_Q12[ 2 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -3 ], a_Q12[ 3 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -4 ], a_Q12[ 4 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -5 ], a_Q12[ 5 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -6 ], a_Q12[ 6 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -7 ], a_Q12[ 7 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -8 ], a_Q12[ 8 ]
);
-        LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -9 ], a_Q12[ 9 ]
);
-        if( predictLPCOrder == 16 ) {
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -10 ], a_Q12[
10 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -11 ], a_Q12[
11 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -12 ], a_Q12[
12 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -13 ], a_Q12[
13 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -14 ], a_Q12[
14 ] );
-            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, psLPC_Q14[ -15 ], a_Q12[
15 ] );
-        }
+        LPC_pred_Q10 = silk_noise_shape_quantizer_short_prediction(psLPC_Q14,
a_Q12, a_Q12_arch, predictLPCOrder, arch);
 
         /* Long-term prediction */
         if( signalType == TYPE_VOICED ) {
diff --git a/silk/NSQ.h b/silk/NSQ.h
new file mode 100644
index 0000000..c71e493
--- /dev/null
+++ b/silk/NSQ.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+Copyright (c) 2014 Vidyo.
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef SILK_NSQ_H
+#define SILK_NSQ_H
+
+#undef silk_short_prediction_create_arch_coef
+
+static OPUS_INLINE opus_int32
silk_noise_shape_quantizer_short_prediction_c(const opus_int32 *buf32, const
opus_int16 *coef16, opus_int order)
+{
+    opus_int32 out;
+    silk_assert( order == 10 || order == 16 );
+
+    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+    out = silk_RSHIFT( order, 1 );
+    out = silk_SMLAWB( out, buf32[  0 ], coef16[ 0 ] );
+    out = silk_SMLAWB( out, buf32[ -1 ], coef16[ 1 ] );
+    out = silk_SMLAWB( out, buf32[ -2 ], coef16[ 2 ] );
+    out = silk_SMLAWB( out, buf32[ -3 ], coef16[ 3 ] );
+    out = silk_SMLAWB( out, buf32[ -4 ], coef16[ 4 ] );
+    out = silk_SMLAWB( out, buf32[ -5 ], coef16[ 5 ] );
+    out = silk_SMLAWB( out, buf32[ -6 ], coef16[ 6 ] );
+    out = silk_SMLAWB( out, buf32[ -7 ], coef16[ 7 ] );
+    out = silk_SMLAWB( out, buf32[ -8 ], coef16[ 8 ] );
+    out = silk_SMLAWB( out, buf32[ -9 ], coef16[ 9 ] );
+
+    if( order == 16 )
+    {
+        out = silk_SMLAWB( out, buf32[ -10 ], coef16[ 10 ] );
+        out = silk_SMLAWB( out, buf32[ -11 ], coef16[ 11 ] );
+        out = silk_SMLAWB( out, buf32[ -12 ], coef16[ 12 ] );
+        out = silk_SMLAWB( out, buf32[ -13 ], coef16[ 13 ] );
+        out = silk_SMLAWB( out, buf32[ -14 ], coef16[ 14 ] );
+        out = silk_SMLAWB( out, buf32[ -15 ], coef16[ 15 ] );
+    }
+    return out;
+}
+
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_c(in, coef,
order))
+
+
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#include "arm/NSQ_neon.h"
+#endif
+
+#endif /* SILK_NSQ_H */
diff --git a/silk/arm/NSQ_neon.c b/silk/arm/NSQ_neon.c
new file mode 100644
index 0000000..4344b37
--- /dev/null
+++ b/silk/arm/NSQ_neon.c
@@ -0,0 +1,69 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "main.h"
+#include "stack_alloc.h"
+#include "NSQ.h"
+#include "celt/cpu_support.h"
+#include "celt/arm/armcpu.h"
+
+opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32, opus_int order)
+{
+    int32x4_t coef0 = vld1q_s32(coef32);
+    int32x4_t coef1 = vld1q_s32(coef32 + 4);
+    int32x4_t coef2 = vld1q_s32(coef32 + 8);
+    int32x4_t coef3 = vld1q_s32(coef32 + 12);
+
+    int32x4_t a0 = vld1q_s32(buf32 - 15);
+    int32x4_t a1 = vld1q_s32(buf32 - 11);
+    int32x4_t a2 = vld1q_s32(buf32 - 7);
+    int32x4_t a3 = vld1q_s32(buf32 - 3);
+
+    int32x4_t b0 = vqdmulhq_s32(coef0, a0);
+    int32x4_t b1 = vqdmulhq_s32(coef1, a1);
+    int32x4_t b2 = vqdmulhq_s32(coef2, a2);
+    int32x4_t b3 = vqdmulhq_s32(coef3, a3);
+
+    int32x4_t c0 = vaddq_s32(b0, b1);
+    int32x4_t c1 = vaddq_s32(b2, b3);
+
+    int32x4_t d = vaddq_s32(c0, c1);
+
+    int64x2_t e = vpaddlq_s32(d);
+
+    int64x1_t f = vadd_s64(vget_low_s64(e), vget_high_s64(e));
+
+    opus_int32 out = vget_lane_s32(vreinterpret_s32_s64(f), 0);
+
+    out += silk_RSHIFT( order, 1 );
+
+    return out;
+}
diff --git a/silk/arm/NSQ_neon.h b/silk/arm/NSQ_neon.h
new file mode 100644
index 0000000..56f9ead
--- /dev/null
+++ b/silk/arm/NSQ_neon.h
@@ -0,0 +1,99 @@
+/***********************************************************************
+Copyright (C) 2014 Vidyo
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef SILK_NSQ_NEON_H
+#define SILK_NSQ_NEON_H
+
+#include "config.h"
+
+#undef silk_short_prediction_create_arch_coef
+/* For vectorized calc, reverse a_Q12 coefs, convert to 32-bit, and shift for
vqdmulhq_s32. */
+static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32
*out, const opus_int16 *in, opus_int order)
+{
+    out[15] = in[0] << 15;
+    out[14] = in[1] << 15;
+    out[13] = in[2] << 15;
+    out[12] = in[3] << 15;
+    out[11] = in[4] << 15;
+    out[10] = in[5] << 15;
+    out[9]  = in[6] << 15;
+    out[8]  = in[7] << 15;
+    out[7]  = in[8] << 15;
+    out[6]  = in[9] << 15;
+
+    if (order == 16)
+    {
+        out[5] = in[10] << 15;
+        out[4] = in[11] << 15;
+        out[3] = in[12] << 15;
+        out[2] = in[13] << 15;
+        out[1] = in[14] << 15;
+        out[0] = in[15] << 15;
+    }
+    else
+    {
+        out[5] = 0;
+        out[4] = 0;
+        out[3] = 0;
+        out[2] = 0;
+        out[1] = 0;
+        out[0] = 0;
+    }
+}
+
+#if OPUS_ARM_PRESUME_NEON_INTR
+
+#define silk_short_prediction_create_arch_coef(out, in, order) \
+    (silk_short_prediction_create_arch_coef_neon(out, in, order))
+
+#elif OPUS_HAVE_RTCD && OPUS_ARM_MAY_HAVE_NEON_INTR
+
+#define silk_short_prediction_create_arch_coef(out, in, order) \
+    do { if (arch == OPUS_ARCH_ARM_NEON) {
silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
+
+#endif
+
+opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32, opus_int order);
+
+#if OPUS_ARM_PRESUME_NEON_INTR
+#undef silk_noise_shape_quantizer_short_prediction
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch) \
+    ((void)arch,silk_noise_shape_quantizer_short_prediction_neon(in, coefRev,
order))
+
+#elif OPUS_HAVE_RTCD && OPUS_ARM_MAY_HAVE_NEON_INTR
+
+/* silk_noise_shape_quantizer_short_prediction implementations take different
parameters based on arch
+   (coef vs. coefRev) so can't use the usual IMPL table implementation */
+#undef silk_noise_shape_quantizer_short_prediction
+#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  \
+    (arch == OPUS_ARCH_ARM_NEON ? \
+        silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) :
\
+        silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
+
+
+#endif
+
+#endif /* SILK_NSQ_NEON_H */
diff --git a/silk/x86/NSQ_sse.c b/silk/x86/NSQ_sse.c
index 72f34fd..bb3c5f1 100644
--- a/silk/x86/NSQ_sse.c
+++ b/silk/x86/NSQ_sse.c
@@ -221,7 +221,7 @@ void silk_NSQ_sse4_1(
         {
             silk_noise_shape_quantizer( NSQ, psIndices->signalType,
x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ],
LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
-                offset_Q10, psEncC->subfr_length,
psEncC->shapingLPCOrder, psEncC->predictLPCOrder );
+                offset_Q10, psEncC->subfr_length,
psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
         }
 
         x_Q3   += psEncC->subfr_length;
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index afd5ec2..d8d6131 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -207,7 +207,8 @@ void silk_noise_shape_quantizer(
     opus_int            offset_Q10,             /* I                           
*/
     opus_int            length,                 /* I    Input length           
*/
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter
order   */
-    opus_int            predictLPCOrder         /* I    Prediction filter order
*/
+    opus_int            predictLPCOrder,        /* I    Prediction filter order
*/
+    int                 arch                    /* I    Architecture           
*/
 );
 
 /**************************/
diff --git a/silk_headers.mk b/silk_headers.mk
index 679ff8f..c74ab81 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -15,6 +15,7 @@ silk/Inlines.h \
 silk/MacroCount.h \
 silk/MacroDebug.h \
 silk/macros.h \
+silk/NSQ.h \
 silk/pitch_est_defines.h \
 silk/resampler_private.h \
 silk/resampler_rom.h \
@@ -25,6 +26,7 @@ silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
+silk/arm/NSQ_neon.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7cfb7d3..79ac6f0 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -82,6 +82,8 @@ silk/x86/x86_silk_map.c \
 silk/x86/VAD_sse.c \
 silk/x86/VQ_WMat_EC_sse.c
 
+SILK_SOURCES_ARM_NEON_INTR = silk/arm/NSQ_neon.c
+
 SILK_SOURCES_FIXED = \
 silk/fixed/LTP_analysis_filter_FIX.c \
 silk/fixed/LTP_scale_ctrl_FIX.c \
-- 
2.5.4 (Apple Git-61)

Jonathan Lennox

2015-Dec-23 00:21 UTC

head link

[opus] [AArch64 neon intrinsics v4 3/5] Apply Neon short prediction optimization to silk_noise_shape_quantizer_del_dec.

---
 silk/NSQ_del_dec.c             | 40 ++++++++++++++++------------------------
 silk/mips/NSQ_del_dec_mipsr1.h |  3 ++-
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index aff560c..ab6feea 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -31,6 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 #include "stack_alloc.h"
+#include "NSQ.h"
+
 
 typedef struct {
     opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
@@ -106,7 +108,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 );
 
 void silk_NSQ_del_dec_c(
@@ -260,7 +263,7 @@ void silk_NSQ_del_dec_c(
         silk_noise_shape_quantizer_del_dec( NSQ, psDelDec,
psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
             delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag,
HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length,
subfr++, psEncC->shapingLPCOrder,
-            psEncC->predictLPCOrder, psEncC->warping_Q16,
psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
+            psEncC->predictLPCOrder, psEncC->warping_Q16,
psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay,
psEncC->arch );
 
         x_Q3   += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
@@ -333,7 +336,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 )
 {
     opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
@@ -343,6 +347,10 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+#ifdef silk_short_prediction_create_arch_coef
+    opus_int32   a_Q12_arch[MAX_LPC_ORDER];
+#endif
+
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
     NSQ_sample_struct  *psSS;
@@ -355,6 +363,10 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
 
+#ifdef silk_short_prediction_create_arch_coef
+    silk_short_prediction_create_arch_coef(a_Q12_arch, a_Q12, predictLPCOrder);
+#endif
+
     for( i = 0; i < length; i++ ) {
         /* Perform common calculations used in all states */
 
@@ -398,27 +410,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
             /* Pointer used in short term prediction and shaping */
             psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
             /* Short-term prediction */
-            silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
-            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
-            LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[  0 ], a_Q12[ 0
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -1 ], a_Q12[ 1
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -2 ], a_Q12[ 2
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -3 ], a_Q12[ 3
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -4 ], a_Q12[ 4
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -5 ], a_Q12[ 5
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -6 ], a_Q12[ 6
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -7 ], a_Q12[ 7
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8
] );
-            LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9
] );
-            if( predictLPCOrder == 16 ) {
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -10 ],
a_Q12[ 10 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -11 ],
a_Q12[ 11 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -12 ],
a_Q12[ 12 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -13 ],
a_Q12[ 13 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -14 ],
a_Q12[ 14 ] );
-                LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -15 ],
a_Q12[ 15 ] );
-            }
+            LPC_pred_Q14 =
silk_noise_shape_quantizer_short_prediction(psLPC_Q14, a_Q12, a_Q12_arch,
predictLPCOrder, arch);
             LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 );                     
/* Q10 -> Q14 */
 
             /* Noise shape feedback */
diff --git a/silk/mips/NSQ_del_dec_mipsr1.h b/silk/mips/NSQ_del_dec_mipsr1.h
index f6afd92..88e281b 100644
--- a/silk/mips/NSQ_del_dec_mipsr1.h
+++ b/silk/mips/NSQ_del_dec_mipsr1.h
@@ -62,7 +62,8 @@ static inline void silk_noise_shape_quantizer_del_dec(
     opus_int            warping_Q16,            /* I                           
*/
     opus_int            nStatesDelayedDecision, /* I    Number of states in
decision tree   */
     opus_int            *smpl_buf_idx,          /* I    Index to newest samples
in buffers  */
-    opus_int            decisionDelay           /* I                           
*/
+    opus_int            decisionDelay,          /* I                           
*/
+    int                 arch                    /* I                           
*/
 )
 {
     opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
-- 
2.5.4 (Apple Git-61)

Jonathan Lennox

2015-Dec-23 00:21 UTC

head link

[opus] [AArch64 neon intrinsics v4 4/5] Add Neon intrinsics for Silk noise shape feedback loop.

---
 silk/NSQ.c          | 18 ++-------------
 silk/NSQ.h          | 27 ++++++++++++++++++++++
 silk/arm/NSQ_neon.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 silk/arm/NSQ_neon.h | 10 ++++++++
 4 files changed, 105 insertions(+), 16 deletions(-)

diff --git a/silk/NSQ.c b/silk/NSQ.c
index eff0224..3bff8ba 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -205,7 +205,7 @@ void silk_noise_shape_quantizer(
     int                 arch                    /* I    Architecture           
*/
 )
 {
-    opus_int     i, j;
+    opus_int     i;
     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
     opus_int32   n_LF_Q12, r_Q10, rr_Q10, q1_Q0, q1_Q10, q2_Q10, rd1_Q20,
rd2_Q20;
     opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
@@ -250,21 +250,7 @@ void silk_noise_shape_quantizer(
 
         /* Noise shape feedback */
         silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order
is even */
-        tmp2 = psLPC_Q14[ 0 ];
-        tmp1 = NSQ->sAR2_Q14[ 0 ];
-        NSQ->sAR2_Q14[ 0 ] = tmp2;
-        n_AR_Q12 = silk_RSHIFT( shapingLPCOrder, 1 );
-        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp2, AR_shp_Q13[ 0 ] );
-        for( j = 2; j < shapingLPCOrder; j += 2 ) {
-            tmp2 = NSQ->sAR2_Q14[ j - 1 ];
-            NSQ->sAR2_Q14[ j - 1 ] = tmp1;
-            n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp1, AR_shp_Q13[ j - 1 ] );
-            tmp1 = NSQ->sAR2_Q14[ j + 0 ];
-            NSQ->sAR2_Q14[ j + 0 ] = tmp2;
-            n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp2, AR_shp_Q13[ j ] );
-        }
-        NSQ->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
-        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp1, AR_shp_Q13[ shapingLPCOrder - 1
] );
+        n_AR_Q12 = silk_NSQ_noise_shape_feedback_loop(psLPC_Q14,
NSQ->sAR2_Q14, AR_shp_Q13, shapingLPCOrder, arch);
 
         n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                               
/* Q11 -> Q12 */
         n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sLF_AR_shp_Q14, Tilt_Q14 );
diff --git a/silk/NSQ.h b/silk/NSQ.h
index c71e493..052d531 100644
--- a/silk/NSQ.h
+++ b/silk/NSQ.h
@@ -62,6 +62,33 @@ static OPUS_INLINE opus_int32
silk_noise_shape_quantizer_short_prediction_c(cons
 
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch)  ((void)arch,silk_noise_shape_quantizer_short_prediction_c(in, coef,
order))
 
+static OPUS_INLINE opus_int32 silk_NSQ_noise_shape_feedback_loop_c(const
opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef, opus_int order)
+{
+    opus_int32 out;
+    opus_int32 tmp1, tmp2;
+    opus_int j;
+
+    tmp2 = data0[0];
+    tmp1 = data1[0];
+    data1[0] = tmp2;
+
+    out = silk_RSHIFT(order, 1);
+    out = silk_SMLAWB(out, tmp2, coef[0]);
+
+    for (j = 2; j < order; j += 2) {
+        tmp2 = data1[j - 1];
+        data1[j - 1] = tmp1;
+        out = silk_SMLAWB(out, tmp1, coef[j - 1]);
+        tmp1 = data1[j + 0];
+        data1[j + 0] = tmp2;
+        out = silk_SMLAWB(out, tmp2, coef[j]);
+    }
+    data1[order - 1] = tmp1;
+    out = silk_SMLAWB(out, tmp1, coef[order - 1]);
+    return out;
+}
+
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
((void)arch,silk_NSQ_noise_shape_feedback_loop_c(data0, data1, coef, order))
 
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #include "arm/NSQ_neon.h"
diff --git a/silk/arm/NSQ_neon.c b/silk/arm/NSQ_neon.c
index 4344b37..d9deb44 100644
--- a/silk/arm/NSQ_neon.c
+++ b/silk/arm/NSQ_neon.c
@@ -67,3 +67,69 @@ opus_int32
silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *bu
 
     return out;
 }
+
+
+opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0,
opus_int32 *data1, const opus_int16 *coef, opus_int order)
+{
+    opus_int32 out;
+    if (order == 8)
+    {
+        int32x4_t a00 = vdupq_n_s32(data0[0]);
+        int32x4_t a01 = vld1q_s32(data1);  /* data1[0] ... [3] */
+
+        int32x4_t a0 = vextq_s32 (a00, a01, 3); /* data0[0] data1[0] ...[2] */
+        int32x4_t a1 = vld1q_s32(data1 + 3);  /* data1[3] ... [6] */
+
+        int16x8_t coef16 = vld1q_s16(coef);
+        int32x4_t coef0 = vmovl_s16(vget_low_s16(coef16));
+        int32x4_t coef1 = vmovl_s16(vget_high_s16(coef16));
+
+        int64x2_t b0 = vmull_s32(vget_low_s32(a0), vget_low_s32(coef0));
+        int64x2_t b1 = vmlal_s32(b0, vget_high_s32(a0), vget_high_s32(coef0));
+        int64x2_t b2 = vmlal_s32(b1, vget_low_s32(a1), vget_low_s32(coef1));
+        int64x2_t b3 = vmlal_s32(b2, vget_high_s32(a1), vget_high_s32(coef1));
+
+        int64x1_t c = vadd_s64(vget_low_s64(b3), vget_high_s64(b3));
+        int64x1_t cS = vshr_n_s64(c, 16);
+        int32x2_t d = vreinterpret_s32_s64(cS);
+
+        out = vget_lane_s32(d, 0);
+        vst1q_s32(data1, a0);
+        vst1q_s32(data1 + 4, a1);
+    }
+    else
+    {
+        opus_int32 tmp1, tmp2;
+        opus_int j;
+
+        tmp2 = data0[0];
+        tmp1 = data1[0];
+        data1[0] = tmp2;
+
+        out = silk_RSHIFT(order, 1);
+        out = silk_SMLAWB(out, tmp2, coef[0]);
+
+        for (j = 2; j < order; j += 2) {
+            tmp2 = data1[j - 1];
+            data1[j - 1] = tmp1;
+            out = silk_SMLAWB(out, tmp1, coef[j - 1]);
+            tmp1 = data1[j + 0];
+            data1[j + 0] = tmp2;
+            out = silk_SMLAWB(out, tmp2, coef[j]);
+        }
+        data1[order - 1] = tmp1;
+        out = silk_SMLAWB(out, tmp1, coef[order - 1]);
+    }
+    return out;
+}
+
+#if !defined(OPUS_ARM_PRESUME_NEON_INTR) && defined(OPUS_HAVE_RTCD)
+
+opus_int32 (*const
SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[OPUS_ARCHMASK+1])(const opus_int32
*data0, opus_int32 *data1, const opus_int16 *coef, opus_int order) = {
+    silk_NSQ_noise_shape_feedback_loop_c,
+    silk_NSQ_noise_shape_feedback_loop_c,
+    silk_NSQ_noise_shape_feedback_loop_c,
+    silk_NSQ_noise_shape_feedback_loop_neon,
+};
+
+#endif
diff --git a/silk/arm/NSQ_neon.h b/silk/arm/NSQ_neon.h
index 56f9ead..2e6e491 100644
--- a/silk/arm/NSQ_neon.h
+++ b/silk/arm/NSQ_neon.h
@@ -78,11 +78,16 @@ static OPUS_INLINE void
silk_short_prediction_create_arch_coef_neon(opus_int32 *
 
 opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32
*buf32, const opus_int32 *coef32, opus_int order);
 
+opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0,
opus_int32 *data1, const opus_int16 *coef, opus_int order);
+
 #if OPUS_ARM_PRESUME_NEON_INTR
 #undef silk_noise_shape_quantizer_short_prediction
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order,
arch) \
     ((void)arch,silk_noise_shape_quantizer_short_prediction_neon(in, coefRev,
order))
 
+#undef silk_NSQ_noise_shape_feedback_loop
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
((void)arch,silk_NSQ_noise_shape_feedback_loop_neon(data0, data1, coef, order))
+
 #elif OPUS_HAVE_RTCD && OPUS_ARM_MAY_HAVE_NEON_INTR
 
 /* silk_noise_shape_quantizer_short_prediction implementations take different
parameters based on arch
@@ -93,6 +98,11 @@ opus_int32
silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *bu
         silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) :
\
         silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
 
+extern opus_int32 (*const
SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[OPUS_ARCHMASK+1])(const opus_int32
*data0, opus_int32 *data1, const opus_int16 *coef, opus_int order);
+
+#undef silk_NSQ_noise_shape_feedback_loop
+#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) 
(SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL[(arch)&OPUS_ARCHMASK](data0,
data1, coef, order))
+
 
 #endif
 
-- 
2.5.4 (Apple Git-61)

Jonathan Lennox

2015-Dec-23 00:21 UTC

head link

[opus] [AArch64 neon intrinsics v4 5/5] Add Neon fixed-point implementation of xcorr_kernel.

Used for celt_pitch_xcorr on aarch64, and celt_fir and celt_iir on both armv7
and aarch64.
---
 celt/arm/arm_celt_map.c   | 17 +++++++++++++
 celt/arm/celt_neon_intr.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++-
 celt/arm/pitch_arm.h      | 31 +++++++++++++++++++++++-
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 85a48e8..0d002b9 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -60,6 +60,23 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const
opus_val16 *,
 #  endif
 # endif /* FIXED_POINT */
 
+#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
+	defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* ARMv4 */
+  xcorr_kernel_c,                /* EDSP */
+  xcorr_kernel_c,                /* Media */
+  xcorr_kernel_neon_fixed,       /* Neon */
+};
+
+#endif
+
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  if defined(HAVE_ARM_NE10)
 #   if defined(CUSTOM_MODES)
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index 47dce15..e721418 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -37,7 +37,66 @@
 #include <arm_neon.h>
 #include "../pitch.h"
 
-#if !defined(FIXED_POINT)
+#if defined(FIXED_POINT)
+void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y,
opus_val32 sum[4], int len)
+{
+	int j;
+	int32x4_t a = vld1q_s32(sum);
+	/* Load y[0...3] */
+	/* This requires len>0 to always be valid (which we assert in the C code).
*/
+	int16x4_t y0 = vld1_s16(y);
+	y += 4;
+
+	for (j = 0; j + 8 <= len; j += 8)
+	{
+		/* Load x[0...7] */
+		int16x8_t xx = vld1q_s16(x);
+		int16x4_t x0 = vget_low_s16(xx);
+		int16x4_t x4 = vget_high_s16(xx);
+		/* Load y[4...11] */
+		int16x8_t yy = vld1q_s16(y);
+		int16x4_t y4 = vget_low_s16(yy);
+		int16x4_t y8 = vget_high_s16(yy);
+		int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+		int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
+
+		int16x4_t y1 = vext_s16(y0, y4, 1);
+		int16x4_t y5 = vext_s16(y4, y8, 1);
+		int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
+		int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
+
+		int16x4_t y2 = vext_s16(y0, y4, 2);
+		int16x4_t y6 = vext_s16(y4, y8, 2);
+		int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
+		int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
+
+		int16x4_t y3 = vext_s16(y0, y4, 3);
+		int16x4_t y7 = vext_s16(y4, y8, 3);
+		int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
+		int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
+
+		y0 = y8;
+		a = a7;
+		x += 8;
+		y += 8;
+	}
+
+	for (; j < len; j++)
+	{
+		int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
+		int32x4_t a0 = vmlal_s16(a, y0, x0);
+
+		int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
+		y0 = vext_s16(y0, y4, 1);
+		a = a0;
+		x++;
+		y++;
+	}
+
+	vst1q_s32(sum, a);
+}
+
+#else
 /*
  * Function: xcorr_kernel_neon_float
  * ---------------------------------
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index 2264f71..6f4e2b6 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -64,7 +64,36 @@ extern opus_val32
 #   define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
-#  endif
+
+#endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+void xcorr_kernel_neon_fixed(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+
+#   define OVERRIDE_XCORR_KERNEL (1)
+
+#   if defined(OPUS_ARM_PRESUME_NEON_INTR) || !defined(OPUS_HAVE_RTCD)
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))
+#   else /* Start !OPUS_ARM_PRESUME_NEON_INTR */
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+
+#   endif /* end !OPUS_ARM_PRESUME_NEON_INTR */
+#  endif /* end OPUS_ARM_MAY_HAVE_NEON_INTR */
 
 #else /* Start !FIXED_POINT */
 /* Float case */
-- 
2.5.4 (Apple Git-61)

Timothy B. Terriberry

2016-Jul-06 22:35 UTC

head link

[opus] [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset

Jonathan Lennox wrote:
> Following Tim's comments, here are my reworked patches for the Neon
> intrinsic function patches of [...]

After far too long, I've finally landed these patches (including the 
others from the earlier series), with a few changes (mostly implemented 
myself on a long plane flight in the name of expediency):

- Removed all tabs (including those from prior commits merged by Jean-Marc).
- Marked arch unused in the MIPS version of 
silk_noise_shape_quantizer_del_dec().
- Added #include "SigProc_FIX.h" to NSQ.h to get a definition of 
OPUS_INLINE as well as opus_int32, opus_int, silk_assert(), 
silk_RSHIFT(), etc.
- Added #include "cpu_support.h" to NSQ_neon.h to get a definition of 
OPUS_ARCHMASK.
- Removed #include "config.h" from NSQ_neon.h: this should be done from 
each .c file (as is the pattern everywhere else).
- Moved SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_NEON_IMPL into an 
arm_silk_map.c (and removed the _NEON tag). If we ever get versions of 
these functions for older ARM arches, they have to go in separate files 
(so we can pass them separate C flags), so putting it in the same 
compilation unit as the NEON version is the wrong place. Also, if we 
ever update the architecture list, we don't want to have to go hunting 
all over the source code for these tables, so all of the SILK ones 
should live in the same place (if we ever get any more).
- Made silk_NSQ_noise_shape_feedback_loop() directly return a Q12 
result, instead of having the caller convert from Q11 to Q12. This saves 
an instruction in the NEON version.
- Added some comments to silk_NSQ_noise_shape_feedback_loop_neon() about 
some repeated conversions we could eliminate and the non-bit-exactness 
w.r.t. the C version.
- Made the final right-shift in 
silk_NSQ_noise_shape_feedback_loop_neon() apply a rounding offset (in 
place of the bias that was in the C version), since it was free.
- Made the fallback in silk_NSQ_noise_shape_feedback_loop_neon() for 
orders other than 8 directly invoke the C version instead of duplicating 
the code.
- Fixed the #ifdef logic for xcorr_kernel_neon_fixed to match that of 
celt_pitch_xcorr_float_neon() (i.e., if we somehow get MAY_HAVE_NEON but 
not PRESUME_NEON and not HAVE_RTCD, don't force invoking the NEON version).
- Rebased the OPUS_FAST_INT64 changes (the way this was defined changed 
in February).

This also included the fix to the configure output Jonathan sent to the 
list on June 30th.

Maybe Matching Threads

Search for more maybe matching threads

opus - Dec 2015 - [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset

[opus] [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset

[opus] [AArch64 neon intrinsics v4 1/5] Add named constants for ARM architecture variants.

[opus] [AArch64 neon intrinsics v4 2/5] Add Neon intrinsics for Silk noise shape quantization.

[opus] [AArch64 neon intrinsics v4 3/5] Apply Neon short prediction optimization to silk_noise_shape_quantizer_del_dec.

[opus] [AArch64 neon intrinsics v4 4/5] Add Neon intrinsics for Silk noise shape feedback loop.

[opus] [AArch64 neon intrinsics v4 5/5] Add Neon fixed-point implementation of xcorr_kernel.

[opus] [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset

Maybe Matching Threads