NSQ_LPC_BUF_LENGTH is independent of DECISION_DELAY. --- silk/define.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/silk/define.h b/silk/define.h index 781cfdc..1286048 100644 --- a/silk/define.h +++ b/silk/define.h @@ -173,11 +173,7 @@ extern "C" #define MAX_MATRIX_SIZE MAX_LPC_ORDER /* Max of LPC Order and LTP order */ -#if( MAX_LPC_ORDER > DECISION_DELAY ) # define NSQ_LPC_BUF_LENGTH MAX_LPC_ORDER -#else -# define NSQ_LPC_BUF_LENGTH DECISION_DELAY -#endif /***************************/ /* Voice activity detector */ -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Aug-23 16:10 UTC
[opus] [PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
Created corresponding unit test, and the optimization is bit exact with C function. This optimization speeds up SILK encoder on NEON as following. Fixed-point: Complexity 0-5: 0% Complexity 6-7: 6% Complexity 8-9: 10% Complexity 10: 8% Got similar results on floating-point. --- silk/NSQ_del_dec.c | 6 +- silk/SigProc_FIX.h | 4 +- silk/arm/NSQ_del_dec_arm.h | 88 ++ silk/arm/NSQ_del_dec_neon_intr.c | 1125 +++++++++++++++++++++++ silk/arm/arm_silk_map.c | 23 + silk/main.h | 6 +- silk/mips/NSQ_del_dec_mipsr1.h | 2 +- silk/tests/test_unit_optimization_NSQ_del_dec.c | 142 +++ silk/x86/NSQ_del_dec_sse.c | 6 +- silk/x86/main_sse.h | 4 +- silk/x86/x86_silk_map.c | 2 +- silk_sources.mk | 1 + tests/test_unit_optimization.c | 2 + 13 files changed, 1399 insertions(+), 12 deletions(-) create mode 100644 silk/arm/NSQ_del_dec_arm.h create mode 100644 silk/arm/NSQ_del_dec_neon_intr.c create mode 100644 silk/tests/test_unit_optimization_NSQ_del_dec.c diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c index 3495613..20640a4 100644 --- a/silk/NSQ_del_dec.c +++ b/silk/NSQ_del_dec.c @@ -109,13 +109,13 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( opus_int predictLPCOrder, /* I Prediction filter order */ opus_int warping_Q16, /* I */ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ - opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ opus_int decisionDelay, /* I */ int arch /* I */ ); void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ @@ -341,7 +341,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( opus_int predictLPCOrder, /* I Prediction filter order */ opus_int warping_Q16, /* I */ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ - opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ opus_int decisionDelay, /* I */ int arch /* I */ ) diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index 0e619d0..4f0a09e 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -599,7 +599,9 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) /* Make sure to store the result as the seed for the next call (also in between */ /* frames), otherwise result won't be random at all. When only using some of the */ /* bits, take the most significant bits by right-shifting. */ -#define silk_RAND(seed) (silk_MLA_ovflw(907633515, (seed), 196314165)) +#define RAND_MULTIPLIER 196314165 +#define RAND_INCREMENT 907633515 +#define silk_RAND(seed) (silk_MLA_ovflw((RAND_INCREMENT), (seed), (RAND_MULTIPLIER))) /* Add some multiplication functions that can be easily mapped to ARM. */ diff --git a/silk/arm/NSQ_del_dec_arm.h b/silk/arm/NSQ_del_dec_arm.h new file mode 100644 index 0000000..93581e1 --- /dev/null +++ b/silk/arm/NSQ_del_dec_arm.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(NSQ_DEL_DEC_ARM_H) +# define NSQ_DEL_DEC_ARM_H + +# include "celt/arm/armcpu.h" + +# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +void silk_NSQ_del_dec_neon( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_silk_NSQ_del_dec (1) +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),PRESUME_NEON(silk_NSQ_del_dec)(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +# endif + +# if !defined(OVERRIDE_silk_NSQ_del_dec) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK+1])( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); +# define OVERRIDE_silk_NSQ_del_dec (1) +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((*SILK_NSQ_DEL_DEC_IMPL[(arch)&OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_silk_NSQ_del_dec (1) +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_neon(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +# endif +# endif + +#endif /* end NSQ_DEL_DEC_ARM_H */ diff --git a/silk/arm/NSQ_del_dec_neon_intr.c b/silk/arm/NSQ_del_dec_neon_intr.c new file mode 100644 index 0000000..c541dcd --- /dev/null +++ b/silk/arm/NSQ_del_dec_neon_intr.c @@ -0,0 +1,1125 @@ +/* Copyright (c) 2016 Google Inc. */ +/** + @file NSQ_del_dec_neon_intr.c + @brief ARM Neon Intrinsic optimizations for silk NSQ_del_dec functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <arm_neon.h> +#include "main.h" +#include "stack_alloc.h" + +typedef struct { + opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ MAX_DEL_DEC_STATES ]; + opus_int32 RandState[ DECISION_DELAY ][ MAX_DEL_DEC_STATES ]; + opus_int32 Q_Q10[ DECISION_DELAY ][ MAX_DEL_DEC_STATES ]; + opus_int32 Xq_Q14[ DECISION_DELAY ][ MAX_DEL_DEC_STATES ]; + opus_int32 Pred_Q15[ DECISION_DELAY ][ MAX_DEL_DEC_STATES ]; + opus_int32 Shape_Q14[ DECISION_DELAY ][ MAX_DEL_DEC_STATES ]; + opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ MAX_DEL_DEC_STATES ]; + opus_int32 LF_AR_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 Diff_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 Seed[ MAX_DEL_DEC_STATES ]; + opus_int32 SeedInit[ MAX_DEL_DEC_STATES ]; + opus_int32 RD_Q10[ MAX_DEL_DEC_STATES ]; +} NSQ_del_decs_struct; + +typedef struct { + opus_int32 Q_Q10[ MAX_DEL_DEC_STATES ]; + opus_int32 RD_Q10[ MAX_DEL_DEC_STATES ]; + opus_int32 xq_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 LF_AR_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 Diff_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 sLTP_shp_Q14[ MAX_DEL_DEC_STATES ]; + opus_int32 LPC_exc_Q14[ MAX_DEL_DEC_STATES ]; +} NSQ_samples_struct; + +static OPUS_INLINE void silk_nsq_del_dec_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +); + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +); + +/* This table records ((index + DECISION_DELAY - 1) % DECISION_DELAY) to avoid the division operation when DECISION_DELAY is not a power of 2. + * This table must be updated if DECISION_DELAY changes. + */ +static const opus_int8 next_smpl_buf_idx_table[ DECISION_DELAY ] +{ + 39, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 +}; + +/* This table records (index % DECISION_DELAY) to avoid the division operation when DECISION_DELAY is not a power of 2. + * This table must be updated if DECISION_DELAY changes. + */ +static const opus_int8 mode_DECISION_DELAY_table[ 3 * DECISION_DELAY ] +{ + /* Repeat 2 times to guarantee no index out of bounds. */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 +}; + +static OPUS_INLINE void copy_winner_state_kernel( + const NSQ_del_decs_struct *psDelDec, + const opus_int offset, + const opus_int last_smple_idx, + const opus_int Winner_ind, + const int32x2_t gain_lo_s32x2, + const int32x2_t gain_hi_s32x2, + const int32x4_t shift_s32x4, + int32x4_t t0_s32x4, + int32x4_t t1_s32x4, + opus_int8 *pulses, + opus_int16 *pxq, + silk_nsq_state *NSQ +) +{ + int16x8_t t_s16x8; + int32x4_t o0_s32x4, o1_s32x4; + + t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 ); + t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) ); + vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) ); + + t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 ); + o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 ); + o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 ); + o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 ); + o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 ); + o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 ); + o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 ); + vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) ); + vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) ); + + t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 ); + t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 ); + t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 ); + vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 ); + vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 ); +} + +static OPUS_INLINE void copy_winner_state( + const NSQ_del_decs_struct *psDelDec, + const opus_int decisionDelay, + const opus_int smpl_buf_idx, + const opus_int Winner_ind, + const opus_int32 gain, + const opus_int32 shift, + opus_int8 *pulses, + opus_int16 *pxq, + silk_nsq_state *NSQ +) +{ + opus_int i, last_smple_idx; + const int32x2_t gain_lo_s32x2 = vdup_n_s32( ( gain & 0x0000FFFF ) << 15 ); + const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 ); + const int32x4_t shift_s32x4 = vdupq_n_s32( -shift ); + int32x4_t t0_s32x4, t1_s32x4; + + t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */ + last_smple_idx = smpl_buf_idx + decisionDelay; + last_smple_idx = mode_DECISION_DELAY_table[ last_smple_idx - 1 + DECISION_DELAY ]; + + for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) { + copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ ); + } + for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) { + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 ); + pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ]; + } + + last_smple_idx += DECISION_DELAY; + for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) { + copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ ); + } + for( ; i < decisionDelay; i++, last_smple_idx-- ) { + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 ); + pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ]; + } +} + +void silk_NSQ_del_dec_neon( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) +{ + /* The optimization parallelizes the different delay decision states. */ + if( psEncC->nStatesDelayedDecision <= 2 ) { + /* When the number of delay decision states is less than 3, there are penalties using the optimization based on ( MAX_DEL_DEC_STATES == 4 ). + * In this case C function is called instead. + * When the number of delay decision states is 2, it's better to specialize another structure NSQ_del_dec2_struct. (Low priority) + */ + silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, + Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 ); + } + else { + opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; + opus_int smpl_buf_idx, decisionDelay; + const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; + opus_int16 *pxq; + VARDECL( opus_int32, sLTP_Q15 ); + VARDECL( opus_int16, sLTP ); + opus_int32 HarmShapeFIRPacked_Q14; + opus_int offset_Q10; + opus_int32 RDmin_Q10, Gain_Q10; + VARDECL( opus_int32, x_sc_Q10 ); + VARDECL( opus_int32, delayedGain_Q10 ); + VARDECL( NSQ_del_decs_struct, psDelDec ); + int32x4_t t_s32x4; + SAVE_STACK; + + /* Set unvoiced lag to the previous one, overwrite later for voiced */ + lag = NSQ->lagPrev; + + silk_assert( NSQ->prev_gain_Q16 != 0 ); + silk_assert( MAX_DEL_DEC_STATES == 4 ); + silk_assert( DECISION_DELAY == 40 ); /* Table next_smpl_buf_idx_table[] and mode_DECISION_DELAY_table[] is hard coded. */ + + /* Initialize delayed decision states */ + ALLOC( psDelDec, 1, NSQ_del_decs_struct ); + /* Only RandState and RD_Q10 need to be initialized to 0. */ + silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) ); + vst1q_s32( psDelDec->RD_Q10, vld1q_dup_s32( psDelDec->RD_Q10 ) ); + + for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) { + psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3; + } + vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) ); + vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) ); + vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) ); + for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { + vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) ); + } + for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) { + vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) ); + } + + offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; + smpl_buf_idx = 0; /* index of oldest samples */ + + decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length ); + + /* For voiced frames limit the decision delay to lower than the pitch lag */ + if( psIndices->signalType == TYPE_VOICED ) { + opus_int pitch_min = pitchL[ 0 ]; + for( k = 1; k < psEncC->nb_subfr; k++ ) { + pitch_min = silk_min_int( pitch_min, pitchL[ k ] ); + } + decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 ); + } else { + if( lag > 0 ) { + decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 ); + } + } + + if( psIndices->NLSFInterpCoef_Q2 == 4 ) { + LSF_interpolation_flag = 0; + } else { + LSF_interpolation_flag = 1; + } + + ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); + ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); + ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); + /* Set up pointers to start of sub frame */ + pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; + NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + subfr = 0; + for( k = 0; k < psEncC->nb_subfr; k++ ) { + A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; + B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; + AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + + /* Noise shape parameters */ + silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); + HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); + HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); + + NSQ->rewhite_flag = 0; + if( psIndices->signalType == TYPE_VOICED ) { + /* Voiced */ + lag = pitchL[ k ]; + + /* Re-whitening */ + if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { + if( k == 2 ) { + /* RESET DELAYED DECISIONS */ + /* Find winner */ + int32x4_t RD_Q10_s32x4; + RDmin_Q10 = psDelDec->RD_Q10[ 0 ]; + Winner_ind = 0; + for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) { + if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) { + RDmin_Q10 = psDelDec->RD_Q10[ i ]; + Winner_ind = i; + } + } + psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 ); + RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 ); + RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) ); + vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 ); + + /* Copy final part of signals from winner state to output and long-term filter states */ + copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ ); + + subfr = 0; + } + + /* Rewhiten with new A coefs */ + start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; + silk_assert( start_idx > 0 ); + + silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], + A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); + + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + NSQ->rewhite_flag = 1; + } + } + + silk_nsq_del_dec_scale_states( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, + LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); + + silk_noise_shape_quantizer_del_dec( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, + delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], + Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, + psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); + + x16 += psEncC->subfr_length; + pulses += psEncC->subfr_length; + pxq += psEncC->subfr_length; + } + + /* Find winner */ + RDmin_Q10 = psDelDec->RD_Q10[ 0 ]; + Winner_ind = 0; + for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { + if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) { + RDmin_Q10 = psDelDec->RD_Q10[ k ]; + Winner_ind = k; + } + } + + /* Copy final part of signals from winner state to output and long-term filter states */ + psIndices->Seed = psDelDec->SeedInit[ Winner_ind ]; + Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); + copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ ); + + silk_assert( !( NSQ_LPC_BUF_LENGTH % 4 ) ); + t_s32x4 = vdupq_n_s32( 0 ); /* initialization */ + for( i = 0; i < NSQ_LPC_BUF_LENGTH; i += 4 ) { + t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 ); + vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 ); + } + + silk_assert( !( ( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ) % 4 ) ); + for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i += 4 ) { + t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 ); + t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 ); + vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 ); + } + + /* Update states */ + NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ]; + NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ]; + NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; + + /* Save quantized speech signal */ + /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof(opus_int16) ) */ + silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof(opus_int16) ); + silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof(opus_int32) ); + RESTORE_STACK; + } +} + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE int32x4_t silk_SMLAWB_lane_neon( + const int32x4_t out_s32x4, + const int32x4_t in_s32x4, + const int32x2_t coef_s32x2, + const opus_int lane +) +{ + return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, lane ) ); +} + +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +) +{ + opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; + opus_int32 Winner_rand_state; + opus_int32 LTP_pred_Q14, n_LTP_Q14; + opus_int32 RDmin_Q10, RDmax_Q10; + opus_int32 Gain_Q10; + opus_int32 *pred_lag_ptr, *shp_lag_ptr; + opus_int32 a_Q12_arch[ MAX_LPC_ORDER ]; + const int32x2_t warping_Q16_s32x2 = vdup_n_s32( ( warping_Q16 << 16 ) >> 1 ); + const opus_int32 LF_shp_Q29 = ( LF_shp_Q14 << 16 ) >> 1; + opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ]; + const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER ); + const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT ); + + VARDECL( NSQ_samples_struct, psSampleState ); + SAVE_STACK; + + silk_assert( nStatesDelayedDecision > 0 ); + silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + ALLOC( psSampleState, 2, NSQ_samples_struct ); + + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; + pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; + Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); + + silk_assert( MAX_SHAPE_LPC_ORDER == 24 ); + { + const int16x8_t t0_s16x8 = vld1q_s16( AR_shp_Q13 + 0 ); + const int16x8_t t1_s16x8 = vld1q_s16( AR_shp_Q13 + 8 ); + const int16x8_t t2_s16x8 = vld1q_s16( AR_shp_Q13 + 16 ); + vst1q_s32( AR_shp_Q28 + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), 15 ) ); + vst1q_s32( AR_shp_Q28 + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), 15 ) ); + vst1q_s32( AR_shp_Q28 + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), 15 ) ); + vst1q_s32( AR_shp_Q28 + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), 15 ) ); + vst1q_s32( AR_shp_Q28 + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), 15 ) ); + vst1q_s32( AR_shp_Q28 + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), 15 ) ); + } + + /* silk_short_prediction_create_arch_coef( a_Q12_arch, a_Q12, predictLPCOrder ); */ + silk_assert( MAX_LPC_ORDER == 16 ); + if( predictLPCOrder == MAX_LPC_ORDER ) { + int16x8_t t0_s16x8, t1_s16x8; + t0_s16x8 = vld1q_s16( a_Q12 + 0 ); /* 7 6 5 4 3 2 1 0 */ + t1_s16x8 = vld1q_s16( a_Q12 + 8 ); /* F E D C B A 9 8 */ + t0_s16x8 = vrev64q_s16( t0_s16x8 ); /* 4 5 6 7 0 1 2 3 */ + t1_s16x8 = vrev64q_s16( t1_s16x8 ); /* C D E F 8 9 A B */ + vst1q_s32( a_Q12_arch + 0, vshll_n_s16( vget_high_s16( t1_s16x8 ), 15 ) ); /* C D E F */ + vst1q_s32( a_Q12_arch + 4, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), 15 ) ); /* 8 9 A B */ + vst1q_s32( a_Q12_arch + 8, vshll_n_s16( vget_high_s16( t0_s16x8 ), 15 ) ); /* 4 5 6 7 */ + vst1q_s32( a_Q12_arch + 12, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), 15 ) ); /* 0 1 2 3 */ + } + else { + int16x8_t t_s16x8; + int16x4_t t_s16x4; + int32x4_t t_s32x4; + silk_assert( predictLPCOrder == MIN_LPC_ORDER ); + t_s16x8 = vld1q_s16( a_Q12 + 0 ); /* 7 6 5 4 3 2 1 0 */ + t_s16x4 = vld1_s16 ( a_Q12 + 8 ); /* B A 9 8 */ + t_s16x8 = vrev64q_s16( t_s16x8 ); /* 4 5 6 7 0 1 2 3 */ + t_s16x4 = vrev64_s16( t_s16x4 ); /* 8 9 A B */ + t_s32x4 = vshll_n_s16( t_s16x4, 15 ); + t_s32x4 = vcombine_s32( vdup_n_s32( 0 ), vget_high_s32( t_s32x4 ) ); /* 8 9 zero zero */ + vst1q_s32( a_Q12_arch + 0, vdupq_n_s32( 0 ) ); /* zero zero zero zero */ + vst1q_s32( a_Q12_arch + 4, t_s32x4 ); /* 8 9 zero zero */ + vst1q_s32( a_Q12_arch + 8, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) ); /* 4 5 6 7 */ + vst1q_s32( a_Q12_arch + 12, vshll_n_s16( vget_low_s16 ( t_s16x8 ), 15 ) ); /* 0 1 2 3 */ + } + + for( i = 0; i < length; i++ ) { + int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4; + int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4; + int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4; + int32x2_t AR_shp_Q28_s32x2; + int16x4_t r_Q10_s16x4, rr_Q10_s16x4; + + /* Perform common calculations used in all states */ + + /* Long-term prediction */ + if( signalType == TYPE_VOICED ) { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q14 = 2; + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); + LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ + pred_lag_ptr++; + } else { + LTP_pred_Q14 = 0; + } + + /* Long-term shaping */ + if( lag > 0 ) { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ + shp_lag_ptr++; + } else { + n_LTP_Q14 = 0; + } + + /* Generate dither */ + Seed_s32x4 = vld1q_s32( psDelDec->Seed ); + Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) ); + vst1q_s32( psDelDec->Seed, Seed_s32x4 ); + + /* Short-term prediction */ + { + const opus_int32 *buf32 = psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ]; + const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 ); + const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 ); + const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 ); + const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 ); + LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( predictLPCOrder, 1 ) ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 0 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch0_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 1 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch0_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 2 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 3 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 4 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch1_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 5 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch1_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 6 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 7 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 8 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch2_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 9 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch2_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch3_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * MAX_DEL_DEC_STATES ), vget_low_s32 ( a_Q12_arch3_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ), 0 ); + LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ), 1 ); + LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */ + } + + /* Noise shape feedback */ + /* Output of lowpass section */ + tmp2_s32x4 = silk_SMLAWB_lane_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2, 0 ); + /* Output of allpass section */ + tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 ); + tmp1_s32x4 = silk_SMLAWB_lane_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2, 0 ); + vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 ); + AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 ); + n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) ); + + /* Loop over allpass sections */ + for( j = 2; j < shapingLPCOrder; j += 2 ) { + /* Output of allpass section */ + tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 ); + tmp2_s32x4 = silk_SMLAWB_lane_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2, 0 ); + vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 ); + n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) ); + /* Output of allpass section */ + tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 ); + tmp1_s32x4 = silk_SMLAWB_lane_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2, 0 ); + vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 ); + AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] ); + n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) ); + } + vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 ); + n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) ); + n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */ + n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), ( Tilt_Q14 << 16 ) >> 1 ) ); /* Q12 */ + n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */ + n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */ + n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), ( LF_shp_Q14 >> 16 ) << 15 ) ); /* Q12 */ + n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */ + + /* Input minus prediction plus noise feedback */ + /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ + tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 ); /* Q14 */ + tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */ + tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 ); /* Q13 */ + tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 ); /* Q10 */ + tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 ); /* residual error Q10 */ + + /* Flip sign depending on dither */ + sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) ); + tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 ); + tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 ); + tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) ); + tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) ); + r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 ); + + /* Find two quantization level candidates and measure their rate-distortion */ + { + int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) ); + int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 ); + int16x4_t q2_Q10_s16x4; + int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4; + uint32x4_t t_u32x4; + + if( Lambda_Q10 > 2048 ) { + /* For aggressive RDO, the bias becomes more than one pulse. */ + const int rdo_offset = Lambda_Q10/2 - 512; + const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ); + const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) ); + /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */ + silk_assert( Lambda_Q10 <= 32767 ); + + q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) ); + q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); + q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); + q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 ); + } + { + const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) ); + const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); + const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); + int16x4_t tmp1_s16x4, tmp2_s16x4; + + q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 ); + tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); + q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) ); + q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 ); + q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 ); + q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 ); + q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) ); + q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 ); + q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 ); + tmp1_s16x4 = q1_Q10_s16x4; + tmp2_s16x4 = q2_Q10_s16x4; + tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 ); + tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 ); + rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) ); + rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) ); + } + + rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 ); + rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 ); + rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 ); + + rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 ); + rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 ); + rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 ); + + tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 ); + tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) ); + tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) ); + vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 ); + vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 ); + t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ); + tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) ); + tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) ); + vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 ); + vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 ); + } + + { + /* Update states for best quantization */ + int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4; + + /* Quantized excitation */ + exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 ); + exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 ); + exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 ); + + /* Add predictions */ + LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) ); + xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 ); + + /* Update states */ + tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) ); + vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 ); + sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 ); + vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) ); + vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 ); + vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 ); + vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 ); + + /* Quantized excitation */ + exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 ); + exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 ); + exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 ); + + /* Add predictions */ + LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) ); + xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 ); + + /* Update states */ + tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) ); + vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 ); + sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 ); + vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) ); + vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 ); + vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 ); + vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 ); + } + + *smpl_buf_idx = next_smpl_buf_idx_table[ *smpl_buf_idx ]; + last_smple_idx = mode_DECISION_DELAY_table[ *smpl_buf_idx + decisionDelay + DECISION_DELAY ]; + + /* Find winner */ + RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ]; + Winner_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ]; + Winner_ind = k; + } + } + + /* Increase RD values of expired states */ + { + uint32x4_t t_u32x4; + Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ]; + t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) ); + t_u32x4 = vmvnq_u32( t_u32x4 ); + t_u32x4 = vshrq_n_u32( t_u32x4, 5 ); + tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 ); + tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 ); + tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) ); + tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) ); + vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 ); + vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 ); + + /* Find worst in first set and best in second set */ + RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ]; + RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ]; + RDmax_ind = 0; + RDmin_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + /* find worst in first set */ + if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) { + RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ]; + RDmax_ind = k; + } + /* find best in second set */ + if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ]; + RDmin_ind = k; + } + } + } + + /* Replace a state if best from second set outperforms worst in first set */ + if( RDmin_Q10 < RDmax_Q10 ) { + /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several + * useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. + * Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity. + */ + opus_int32 *dst = (opus_int32 *)psDelDec + ( i + 1 ) * MAX_DEL_DEC_STATES + RDmax_ind; + const opus_int32 *src = dst + RDmin_ind - RDmax_ind; + for( j = 0; j < NSQ_LPC_BUF_LENGTH - 1; j++ ) { + dst[ 4 * j ] = src[ 4 * j ]; + } + dst = (opus_int32 *)psDelDec->RandState + RDmax_ind; + src = dst + RDmin_ind - RDmax_ind; + for( j = 0; j < (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *) 0 )->sLPC_Q14 ) ) / ( MAX_DEL_DEC_STATES * sizeof(opus_int32) ) - 9 ); j += 10 ) { + dst[ 4 * ( j + 0 ) ] = src[ 4 * ( j + 0 ) ]; + dst[ 4 * ( j + 1 ) ] = src[ 4 * ( j + 1 ) ]; + dst[ 4 * ( j + 2 ) ] = src[ 4 * ( j + 2 ) ]; + dst[ 4 * ( j + 3 ) ] = src[ 4 * ( j + 3 ) ]; + dst[ 4 * ( j + 4 ) ] = src[ 4 * ( j + 4 ) ]; + dst[ 4 * ( j + 5 ) ] = src[ 4 * ( j + 5 ) ]; + dst[ 4 * ( j + 6 ) ] = src[ 4 * ( j + 6 ) ]; + dst[ 4 * ( j + 7 ) ] = src[ 4 * ( j + 7 ) ]; + dst[ 4 * ( j + 8 ) ] = src[ 4 * ( j + 8 ) ]; + dst[ 4 * ( j + 9 ) ] = src[ 4 * ( j + 9 ) ]; + } + for( ; j < (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *) 0 )->sLPC_Q14 ) ) / ( MAX_DEL_DEC_STATES * sizeof(opus_int32) ) ); j++ ) { + dst[ 4 * j ] = src[ 4 * j ]; + } + psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ]; + psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ]; + psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ]; + psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ]; + psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ]; + psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ]; + psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ]; + } + + /* Write samples from winner to output and long-term filter states */ + if( subfr > 0 || i >= decisionDelay ) { + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 ); + xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( + silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ]; + sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ]; + } + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Update states */ + vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) ); + vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) ); + vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) ); + vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) ); + tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 ); + vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 ); + vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) ); + vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) ); + tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 ); + tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 ); + vst1q_s32( psDelDec->Seed, tmp1_s32x4 ); + vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 ); + vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) ); + delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; + } + /* Update LPC states */ + silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof(opus_int32) ); + + RESTORE_STACK; +} + +static OPUS_INLINE void silk_SMULWB_8_neon( + const opus_int16 *a, + const int32x2_t b, + opus_int32 *o +) +{ + const int16x8_t a_s16x8 = vld1q_s16( a ); + int32x4_t o0_s32x4, o1_s32x4; + + o0_s32x4 = vshll_n_s16( vget_low_s16 ( a_s16x8 ), 15 ); + o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 ); + o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 ); + o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 ); + vst1q_s32( o, o0_s32x4 ); + vst1q_s32( o + 4, o1_s32x4 ); +} + +/* Only works when ( b >= -65536 ) && ( b < 65536 ). */ +static OPUS_INLINE void silk_SMULWW_small_b_4_neon( + const opus_int32 *a, + const int32x2_t b_s32x2, + opus_int32 *o +) +{ + int32x4_t o_s32x4; + + o_s32x4 = vld1q_s32( a ); + o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 ); + vst1q_s32( o, o_s32x4 ); +} + +/* Only works when ( b >= -65536 ) && ( b < 65536 ). */ +static OPUS_INLINE void silk_SMULWW_small_b_8_neon( + const opus_int32 *a, + const int32x2_t b_s32x2, + opus_int32 *o +) +{ + int32x4_t o0_s32x4, o1_s32x4; + + o0_s32x4 = vld1q_s32( a ); + o1_s32x4 = vld1q_s32( a + 4 ); + o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 ); + o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 ); + vst1q_s32( o, o0_s32x4 ); + vst1q_s32( o + 4, o1_s32x4 ); +} + +static OPUS_INLINE void silk_SMULWW_4_neon( + const opus_int32 *a, + const int32x2_t b_s32x2, + opus_int32 *o +) +{ + int32x4_t a_s32x4, o_s32x4; + + a_s32x4 = vld1q_s32( a ); + o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 ); + o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 ); + vst1q_s32( o, o_s32x4 ); +} + +static OPUS_INLINE void silk_SMULWW_8_neon( + const opus_int32 *a, + const int32x2_t b_s32x2, + opus_int32 *o +) +{ + int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4; + + a0_s32x4 = vld1q_s32( a ); + a1_s32x4 = vld1q_s32( a + 4 ); + o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 ); + o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 ); + o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 ); + o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 ); + vst1q_s32( o, o0_s32x4 ); + vst1q_s32( o + 4, o1_s32x4 ); +} + +static OPUS_INLINE void silk_SMULWW_loop_neon( + const opus_int16 *a, + const opus_int32 b, + opus_int32 *o, + const opus_int loop_num +) +{ + opus_int i; + int32x2_t b_s32x2; + + b_s32x2 = vdup_n_s32( b ); + for( i = 0; i < loop_num - 7; i += 8 ) { + silk_SMULWB_8_neon( a + i, b_s32x2, o + i ); + } + for( ; i < loop_num; i++ ) { + o[ i ] = silk_SMULWW( a[ i ], b ); + } +} + +static OPUS_INLINE void silk_nsq_del_dec_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +) +{ + opus_int i, lag; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; + + lag = pitchL[ subfr ]; + inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); + silk_assert( inv_gain_Q31 != 0 ); + + /* Scale input */ + inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); + silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length ); + + /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ + if( NSQ->rewhite_flag ) { + if( subfr == 0 ) { + /* Do LTP downscaling */ + inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); + } + silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 ); + } + + /* Adjust for changing gain */ + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { + int32x2_t gain_adj_Q16_s32x2; + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + + /* Scale long-term shaping state */ + silk_assert( !( ( MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH ) & 7 ) ); + silk_assert( !( ( MAX_DEL_DEC_STATES * MAX_SHAPE_LPC_ORDER ) & 7 ) ); + silk_assert( !( ( MAX_DEL_DEC_STATES * DECISION_DELAY ) & 7 ) ); + if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) { + gain_adj_Q16_s32x2 = vdup_n_s32( gain_adj_Q16 << 15 ); + for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) { + silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2, NSQ->sLTP_shp_Q14 + i ); + } + for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { + NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); + } + + /* Scale long-term prediction state */ + if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) { + silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2, sLTP_Q15 + i ); + } + for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { + sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); + } + } + + /* Scale scalar states */ + silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2, psDelDec->LF_AR_Q14 ); + silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2, psDelDec->Diff_Q14 ); + + /* Scale short-term prediction and shaping states */ + opus_int32 *state0, *state1; + state0 = psDelDec->sLPC_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH; i += 8 ) { + silk_SMULWW_small_b_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + } + state0 = psDelDec->sAR2_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * MAX_SHAPE_LPC_ORDER; i += 8 ) { + silk_SMULWW_small_b_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + } + state0 = psDelDec->Pred_Q15[ 0 ]; + state1 = psDelDec->Shape_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * DECISION_DELAY; i += 8 ) { + silk_SMULWW_small_b_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + silk_SMULWW_small_b_8_neon( state1 + i, gain_adj_Q16_s32x2, state1 + i ); + } + } + else { + gain_adj_Q16_s32x2 = vdup_n_s32( ( gain_adj_Q16 & 0x0000FFFF ) << 15 ); + gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 ); + for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) { + silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2, NSQ->sLTP_shp_Q14 + i ); + } + for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { + NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); + } + + /* Scale long-term prediction state */ + if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) { + silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2, sLTP_Q15 + i ); + } + for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { + sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); + } + } + + /* Scale scalar states */ + silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2, psDelDec->LF_AR_Q14 ); + silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2, psDelDec->Diff_Q14 ); + + /* Scale short-term prediction and shaping states */ + opus_int32 *state0, *state1; + state0 = psDelDec->sLPC_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH; i += 8 ) { + silk_SMULWW_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + } + state0 = psDelDec->sAR2_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * MAX_SHAPE_LPC_ORDER; i += 8 ) { + silk_SMULWW_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + } + state0 = psDelDec->Pred_Q15[ 0 ]; + state1 = psDelDec->Shape_Q14[ 0 ]; + for( i = 0; i < MAX_DEL_DEC_STATES * DECISION_DELAY; i += 8 ) { + silk_SMULWW_8_neon( state0 + i, gain_adj_Q16_s32x2, state0 + i ); + silk_SMULWW_8_neon( state1 + i, gain_adj_Q16_s32x2, state1 + i ); + } + } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; + } +} diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index b1783c7..8deaf99 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -60,6 +60,29 @@ opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O R MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */ }; +void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) = { + silk_NSQ_del_dec_c, /* ARMv4 */ + silk_NSQ_del_dec_c, /* EDSP */ + silk_NSQ_del_dec_c, /* Media */ + MAY_HAVE_NEON(silk_NSQ_del_dec), /* Neon */ +}; + /*There is no table for silk_noise_shape_quantizer_short_prediction because the NEON version takes different parameters than the C version. Instead RTCD is done via if statements at the call sites. diff --git a/silk/main.h b/silk/main.h index 13d4241..73c94fc 100644 --- a/silk/main.h +++ b/silk/main.h @@ -42,6 +42,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "x86/main_sse.h" #endif +#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/NSQ_del_dec_arm.h" +#endif + /* Convert Left/Right stereo signal to adaptive Mid/Side representation */ void silk_stereo_LR_to_MS( stereo_enc_state *state, /* I/O State */ @@ -269,7 +273,7 @@ void silk_NSQ_c( /* Noise shaping using delayed decision */ void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ diff --git a/silk/mips/NSQ_del_dec_mipsr1.h b/silk/mips/NSQ_del_dec_mipsr1.h index 3ca6464..cd70713 100644 --- a/silk/mips/NSQ_del_dec_mipsr1.h +++ b/silk/mips/NSQ_del_dec_mipsr1.h @@ -61,7 +61,7 @@ static inline void silk_noise_shape_quantizer_del_dec( opus_int predictLPCOrder, /* I Prediction filter order */ opus_int warping_Q16, /* I */ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ - opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ opus_int decisionDelay, /* I */ int arch /* I */ ) diff --git a/silk/tests/test_unit_optimization_NSQ_del_dec.c b/silk/tests/test_unit_optimization_NSQ_del_dec.c new file mode 100644 index 0000000..572b06a --- /dev/null +++ b/silk/tests/test_unit_optimization_NSQ_del_dec.c @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> +#include "main_FIX.h" +#include "celt/_kiss_fft_guts.h" +#include "silk/NSQ_del_dec.c" + +#define MIN_nStatesDelayedDecision 1 + +static OPUS_INLINE void init_buffer(void* buffer, int size) +{ + char* tmp = (char*)buffer; + for(int i = 0; i < size; i++) + { + tmp[i] = rand(); + } +} + +static int test_silk_NSQ_del_dec(int arch) +{ + int result = 0; + silk_encoder_state psEncC; + silk_nsq_state NSQ_org, NSQ_opt; + SideInfoIndices psIndices_org, psIndices_opt; + opus_int16 x16[MAX_FRAME_LENGTH]; + opus_int8 pulses_org[MAX_FRAME_LENGTH], pulses_opt[MAX_FRAME_LENGTH]; + opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ]; + opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ]; + opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ]; + opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ]; + opus_int Tilt_Q14[ MAX_NB_SUBFR ]; + opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ]; + opus_int32 Gains_Q16[ MAX_NB_SUBFR ]; + opus_int pitchL[ MAX_NB_SUBFR ]; + opus_int Lambda_Q10; + opus_int LTP_scale_Q14; + opus_int subfr_length; + opus_int shapingLPCOrder; + opus_int nStatesDelayedDecision; + + printf("%50s", "silk_NSQ_del_dec() ..."); + for( subfr_length = DECISION_DELAY; subfr_length <= MAX_SUB_FRAME_LENGTH; subfr_length++ ) + { + for(nStatesDelayedDecision = MIN_nStatesDelayedDecision; nStatesDelayedDecision <= MAX_DEL_DEC_STATES; nStatesDelayedDecision++ ) + { + for( shapingLPCOrder = 12; shapingLPCOrder <= MAX_SHAPE_LPC_ORDER; shapingLPCOrder += 2 ) // shapingLPCOrder must be even. + { + init_buffer(&psEncC, sizeof(psEncC)); + init_buffer(&NSQ_org, sizeof(NSQ_org)); + init_buffer(&psIndices_org, sizeof(psIndices_org)); + init_buffer(pulses_org, sizeof(pulses_org)); + init_buffer(x16, sizeof(x16)); + init_buffer(PredCoef_Q12, sizeof(PredCoef_Q12)); + init_buffer(LTPCoef_Q14, sizeof(LTPCoef_Q14)); + init_buffer(AR_Q13, sizeof(AR_Q13)); + init_buffer(HarmShapeGain_Q14, sizeof(HarmShapeGain_Q14)); + init_buffer(Tilt_Q14, sizeof(Tilt_Q14)); + init_buffer(LF_shp_Q14, sizeof(LF_shp_Q14)); + init_buffer(Gains_Q16, sizeof(Gains_Q16)); + + psEncC.subfr_length = subfr_length; + psEncC.nStatesDelayedDecision = nStatesDelayedDecision; + psEncC.shapingLPCOrder = shapingLPCOrder; + pitchL[0] = rand() % 289; + pitchL[0] = MAX(pitchL[0], 80); // Restrict to value range [80, 288] + for( int i = 1; i < MAX_NB_SUBFR; i++) + { + // The following sub frame pitchL cannot have big difference from pitchL[0]. Otherwise sLTP_Q15[] in silk_nsq_del_dec_scale_states() will access uninitialized values. + pitchL[i] = pitchL[0]; + pitchL[i] += rand() % 6; + pitchL[i] -= rand() % 6; + } + NSQ_org.lagPrev = rand() % 289; + NSQ_org.lagPrev = MAX( NSQ_org.lagPrev, 80); + Lambda_Q10 = rand() % 32768; + LTP_scale_Q14 = rand(); + psEncC.predictLPCOrder = (rand() & 1) ? MIN_LPC_ORDER : MAX_LPC_ORDER; + psEncC.warping_Q16 = rand() % 32767; + psEncC.arch = arch; + psEncC.nb_subfr = 4; + psEncC.frame_length = MAX_FRAME_LENGTH; //psEncC.nb_subfr * subfr_length; + psEncC.ltp_mem_length = psEncC.frame_length;//LTP_MEM_LENGTH_MS * 8; //??? + psIndices_org.signalType = rand() % (TYPE_VOICED + 1); + psIndices_org.quantOffsetType = rand() & 1; + psIndices_org.NLSFInterpCoef_Q2 = rand() & 4; + + memcpy(&NSQ_opt, &NSQ_org, sizeof(NSQ_org)); + memcpy(&psIndices_opt, &psIndices_org, sizeof(psIndices_org)); + memcpy(pulses_opt, pulses_org, sizeof(pulses_org)); + + silk_NSQ_del_dec_c(&psEncC, &NSQ_org, &psIndices_org, x16, pulses_org, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); + silk_NSQ_del_dec (&psEncC, &NSQ_opt, &psIndices_opt, x16, pulses_opt, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch); + + if (memcmp(&NSQ_opt, &NSQ_org, sizeof(NSQ_org))) { printf("NSQ_org different!\n"); result = -1; } + if (memcmp(&psIndices_opt, &psIndices_org, sizeof(psIndices_org))) { printf("psIndices different!\n"); result = -1; } + if (memcmp(pulses_opt, pulses_org, sizeof(pulses_org))) { printf("pulses different!\n"); result = -1; } + if (result) + { + printf("subfr_length=%3d, nStatesDelayedDecision=%d, shapingLPCOrder=%2d, psEncC.predictLPCOrder=%d failed!\n", subfr_length, nStatesDelayedDecision, shapingLPCOrder, psEncC.predictLPCOrder); + return result; + } + } + } + } + + printf(" passed!\n"); + return result; +} diff --git a/silk/x86/NSQ_del_dec_sse.c b/silk/x86/NSQ_del_dec_sse.c index a6f84e1..29dd872 100644 --- a/silk/x86/NSQ_del_dec_sse.c +++ b/silk/x86/NSQ_del_dec_sse.c @@ -107,12 +107,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( opus_int predictLPCOrder, /* I Prediction filter order */ opus_int warping_Q16, /* I */ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ - opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ opus_int decisionDelay /* I */ ); void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int32 x_Q3[], /* I Prefiltered input signal */ @@ -335,7 +335,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( opus_int predictLPCOrder, /* I Prediction filter order */ opus_int warping_Q16, /* I */ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ - opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ opus_int decisionDelay /* I */ ) { diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h index a221f31..42a6c70 100644 --- a/silk/x86/main_sse.h +++ b/silk/x86/main_sse.h @@ -140,7 +140,7 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( # define OVERRIDE_silk_NSQ_del_dec void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int32 x_Q3[], /* I Prefiltered input signal */ @@ -167,7 +167,7 @@ void silk_NSQ_del_dec_sse4_1( #else extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int32 x_Q3[], /* I Prefiltered input signal */ diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c index 6a1d75c..e69da60 100644 --- a/silk/x86/x86_silk_map.c +++ b/silk/x86/x86_silk_map.c @@ -116,7 +116,7 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( #if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( - const silk_encoder_state *psEncC, /* I/O Encoder State */ + const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int32 x_Q3[], /* I Prefiltered input signal */ diff --git a/silk_sources.mk b/silk_sources.mk index d2d5b35..9dcfe83 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -87,6 +87,7 @@ SILK_SOURCES_ARM_NEON_INTR = \ silk/arm/arm_silk_map.c \ silk/arm/LPC_analysis_filter_neon_intr.c \ silk/arm/LPC_inv_pred_gain_neon_intr.c \ +silk/arm/NSQ_del_dec_neon_intr.c \ silk/arm/NSQ_neon.c SILK_SOURCES_FIXED = \ diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c index 55425c4..19fda42 100644 --- a/tests/test_unit_optimization.c +++ b/tests/test_unit_optimization.c @@ -47,6 +47,7 @@ # include "silk/tests/test_unit_optimization_LPC_analysis_filter.c" # include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c" +# include "silk/tests/test_unit_optimization_NSQ_del_dec.c" #define NUM_UNIT_TEST_LOOP 10 @@ -68,6 +69,7 @@ int main(void) #endif /* FIXED_POINT */ result |= test_silk_LPC_analysis_filter(arch); result |= test_silk_LPC_inverse_pred_gain(arch); + result |= test_silk_NSQ_del_dec(arch); } return result; } -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Aug-23 17:27 UTC
[opus] [PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
I'm attaching my previous 6 NEON optimization patches, since Patch 7 & 8 are dependents of them. They have been rebased with today's master. All optimizations are bit-exact with C functions. Linfeng -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.xiph.org/pipermail/opus/attachments/20160823/e94fc8fa/attachment-0001.html> -------------- next part -------------- A non-text attachment was scrubbed... Name: 0001-0006-NEON-Patches.zip Type: application/zip Size: 38465 bytes Desc: not available URL: <http://lists.xiph.org/pipermail/opus/attachments/20160823/e94fc8fa/attachment-0001.zip>
Possibly Parallel Threads
- [PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
- [AArch64 neon intrinsics v4 0/5] Rework Neon intrinsic code for Aarch64 patchset
- [Aarch64 v2 00/18] Patches to enable Aarch64 (version 2)
- [PATCH 0/8] Patches for arm64 (aarch64) support
- [Aarch64 00/11] Patches to enable Aarch64 (arm64) optimizations, rebased to current master.