Displaying 8 results from an estimated 8 matches for "t_s32x4".
2017 Apr 26
2
2 patches related to silk_biquad_alt() optimization
...ut32_Q14[ 1 ] + (1<<14) - 1, 14 ) );
}
}
}
Here are the NEON kernels, which use vqrdmulh_lane_s32() to do the
multiplication and rounding, where A_Q28_s32x{2,4} stores doubled -A_Q28[]:
static inline void silk_biquad_alt_stride1_kernel(const int32x2_t
A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t
*out32_Q14_s32x2)
{
int32x2_t t_s32x2;
*out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));
/* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] )
*/
*S_s32x2 = vreinterpret_s32_u64(vshr_n_
u64(vreinterpret_u64_...
2017 May 15
2
2 patches related to silk_biquad_alt() optimization
...}
>
> Here are the NEON kernels, which use vqrdmulh_lane_s32() to do the
> multiplication and rounding, where A_Q28_s32x{2,4} stores doubled
> -A_Q28[]:
>
> static inline void silk_biquad_alt_stride1_kernel(const int32x2_t
> A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t
> *out32_Q14_s32x2)
> {
> int32x2_t t_s32x2;
>
> *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));
> /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ]
> )...
2017 May 08
0
2 patches related to silk_biquad_alt() optimization
...> }
> }
> }
>
> Here are the NEON kernels, which use vqrdmulh_lane_s32() to do the
> multiplication and rounding, where A_Q28_s32x{2,4} stores doubled -A_Q28[]:
>
> static inline void silk_biquad_alt_stride1_kernel(const int32x2_t
> A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t
> *out32_Q14_s32x2)
> {
> int32x2_t t_s32x2;
>
> *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));
> /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] )
> */
> *S_s32x2 = vreinte...
2017 May 17
0
2 patches related to silk_biquad_alt() optimization
...e is the NEON kernels which uses vqrdmulh_lane_s32() to do the
> > multiplication and rounding, where A_Q28_s32x{2,4} stores doubled
> > -A_Q28[]:
> >
> > static inline void silk_biquad_alt_stride1_kernel(const int32x2_t
> > A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t
> > *out32_Q14_s32x2)
> > {
> > int32x2_t t_s32x2;
> >
> > *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));
> > /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ]
> >...
2017 Apr 25
2
2 patches related to silk_biquad_alt() optimization
On Mon, Apr 24, 2017 at 5:52 PM, Jean-Marc Valin <jmvalin at jmvalin.ca> wrote:
> On 24/04/17 08:03 PM, Linfeng Zhang wrote:
> > Tested on my chromebook, when stride (channel) == 1, the optimization
> > has no gain compared with C function.
>
> You mean that the Neon code is the same speed as the C code for
> stride==1? This is not terribly surprising for an IIRC
2016 Aug 23
0
[PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
...(
+ const NSQ_del_decs_struct *psDelDec,
+ const opus_int offset,
+ const opus_int last_smple_idx,
+ const opus_int Winner_ind,
+ const int32x2_t gain_lo_s32x2,
+ const int32x2_t gain_hi_s32x2,
+ const int32x4_t shift_s32x4,
+ int32x4_t t0_s32x4,
+ int32x4_t t1_s32x4,
+ opus_int8 *pulses,
+ opus_int16 *pxq,
+ silk_nsq_state *NSQ
+)
+{
+ int16x8_t t_s16x8;
+ int32x4_t o0_s32x4, o1_s32x4;
+
+ t0_s32x4 = vld1q_lane_s32( &...
2016 Aug 23
2
[PATCH 7/8] Update NSQ_LPC_BUF_LENGTH macro.
NSQ_LPC_BUF_LENGTH is independent of DECISION_DELAY.
---
silk/define.h | 4 ----
1 file changed, 4 deletions(-)
diff --git a/silk/define.h b/silk/define.h
index 781cfdc..1286048 100644
--- a/silk/define.h
+++ b/silk/define.h
@@ -173,11 +173,7 @@ extern "C"
#define MAX_MATRIX_SIZE MAX_LPC_ORDER /* Max of LPC Order and LTP order */
-#if( MAX_LPC_ORDER >
2016 Jul 14
6
Several patches of ARM NEON optimization
I rebased my previous 3 patches onto the current master with minor changes.
Patches 1 to 3 replace all my previously submitted patches.
Patches 4 and 5 are new.
Thanks,
Linfeng Zhang