Displaying 5 results from an estimated 5 matches for "out32_q14_s32x2".
2017 Apr 26 (2) 2 patches related to silk_biquad_alt() optimization
...);
}
}
}
Here are the NEON kernels, which use vqrdmulh_lane_s32() to do the
multiplication and rounding, where A_Q28_s32x{2,4} stores doubled -A_Q28[]:
static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
{
    int32x2_t t_s32x2;

    *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));                    /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ) */
    *S_s32x2 = vreinterpret_s32_u64(vshr_n_u64(vreinterpret_u64_s32(*S_s32x2), 32)); /* S[ 0 ] = S[ 1 ]; S[ 1 ]...
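For readers skimming the thread, here is a minimal scalar sketch of what that vqrdmulh trick computes, assuming the Q28 coefficient / Q14 output formats implied by the identifier names; the helper name model_vqrdmulh_s32 is invented for illustration and is not part of the patch or of arm_neon.h:

#include <stdint.h>

/* VQRDMULH is a saturating, rounding, doubling multiply that keeps the high
 * 32 bits:  sat32( ( 2 * (int64)x * c + (1 << 31) ) >> 32 ),
 * i.e.      sat32( ( (int64)x * c + (1 << 30) ) >> 31 ).
 * With c = 2 * -A_Q28[ i ], this equals round( ( x * -A_Q28[ i ] ) >> 30 ):
 * a rounded Q14-by-Q28 multiply done in one instruction, which plain C has to
 * assemble from narrower partial products such as silk_SMULWB()/silk_SMLAWB(). */
static int32_t model_vqrdmulh_s32( int32_t x, int32_t c )
{
    int64_t r = ( (int64_t)x * c + ( 1LL << 30 ) ) >> 31;
    if( r > INT32_MAX ) r = INT32_MAX;   /* saturation hit only when x == c == INT32_MIN */
    if( r < INT32_MIN ) r = INT32_MIN;
    return (int32_t)r;
}

The lane form used in the kernel, vqrdmulh_lane_s32(), applies the same operation to each lane of out32_Q14_s32x2, which is why the coefficients are stored pre-doubled and negated.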
2017 May 15 (2) 2 patches related to silk_biquad_alt() optimization
...ch uses vqrdmulh_lane_s32() to do the
> multiplication and rounding, where A_Q28_s32x{2,4} stores doubled
> -A_Q28[]:
>
> static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
> {
>     int32x2_t t_s32x2;
>
>     *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));    /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ) */
>     *S_s32x2 = vreinter...
2017 May 08 (0) 2 patches related to silk_biquad_alt() optimization
...re are the NEON kernels, which use vqrdmulh_lane_s32() to do the
> multiplication and rounding, where A_Q28_s32x{2,4} stores doubled -A_Q28[]:
>
> static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
> {
>     int32x2_t t_s32x2;
>
>     *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));    /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ) */
>     *S_s32x2 = vreinterpret_s32_u64(vshr_n_u64(vreinterpret_u64_s32(*...
2017 May 17 (0) 2 patches related to silk_biquad_alt() optimization
...o do the
> > multiplication and rounding, where A_Q28_s32x{2,4} stores doubled
> > -A_Q28[]:
> >
> > static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
> > {
> >     int32x2_t t_s32x2;
> >
> >     *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));    /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ) */
> >     *S_s...
2017 Apr 25 (2) 2 patches related to silk_biquad_alt() optimization
On Mon, Apr 24, 2017 at 5:52 PM, Jean-Marc Valin <jmvalin at jmvalin.ca> wrote:
> On 24/04/17 08:03 PM, Linfeng Zhang wrote:
> > Tested on my chromebook, when stride (channel) == 1, the optimization
> > has no gain compared with the C function.
>
> You mean that the Neon code is the same speed as the C code for
> stride==1? This is not terribly surprising for an IIR...
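To make the stride==1 remark concrete: a biquad is a recursive (IIR) filter, so each output feeds the state used by the very next sample. Below is a minimal sketch, in plain floating point rather than SILK's fixed point, of the transposed direct form II recurrence and the loop-carried dependency it creates:

/* Not the SILK code: a generic transposed direct form II biquad.
 * y depends on S[ 0 ], and S[ 0 ]/S[ 1 ] are rewritten using y, so iteration
 * k+1 cannot start before iteration k finishes.  For stride == 1 there is only
 * this one dependent chain; with stride > 1 each channel has its own chain,
 * and the channels can be spread across NEON lanes. */
static void biquad_df2t_sketch( const float *in, float *out, int len,
                                const float b[ 3 ], const float a[ 2 ],
                                float S[ 2 ] )
{
    int k;
    for( k = 0; k < len; k++ ) {
        float y  = b[ 0 ] * in[ k ] + S[ 0 ];
        S[ 0 ]   = b[ 1 ] * in[ k ] - a[ 0 ] * y + S[ 1 ];
        S[ 1 ]   = b[ 2 ] * in[ k ] - a[ 1 ] * y;
        out[ k ] = y;
    }
}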