Linfeng Zhang
2017-Apr-26 21:15 UTC
[opus] 2 patches related to silk_biquad_alt() optimization
On Tue, Apr 25, 2017 at 10:31 PM, Jean-Marc Valin <jmvalin at jmvalin.ca> wrote:

> > A_Q28 is split to 2 14-bit (or 16-bit, whatever) integers, to make the
> > multiplication operation within 32-bits. NEON can do 32-bit x 32-bit =
> > 64-bit using 'int64x2_t vmull_s32(int32x2_t a, int32x2_t b)', and it
> > could possibly be faster and have fewer rounding/shifting errors than the
> > above C code. But it may increase difficulties for other CPUs not
> > supporting 32-bit multiplication.
>
> OK, so I'm not totally opposed to that, but it increases the
> testing/maintenance cost so it needs to be worth it. So the question is
> how much speedup you can get and how close you can make the result to
> the original function. If you can make the output always be within one
> or two LSBs of the C version, then the asm check can simply be a little
> bit more lax than usual. Otherwise it becomes more complicated. This
> isn't a function that scares me too much about going non-bitexact, but
> it's also not one of the big complexity costs either. In any case, let
> me know what you find.

I tested the proposed NEON optimization change. It increases the whole encoder speed by about 1% at Complexity 8 and 10 for stride = 2 (compared to the NEON optimization in the previous patch), and by 0.7% at Complexity 8 and 1.1% at Complexity 10 for stride = 1 (compared to the original C code).

Unfortunately, the truncation error accumulates in S[] and cannot be easily bounded. The difference in out[] appeared to be bounded by 5 in my quick testing, but that is not guaranteed for other inputs. So comparing for bit exactness against the following silk_biquad_alt_c_MulSingleAQ28() may be better.

Please let me know the decision (whether to keep the original NEON (stride 2 only) or to choose the new NEON (both stride 1 and 2), which matches the following silk_biquad_alt_c_MulSingleAQ28()), and I'll wrap up the patch.

Attached is the corresponding C code for silk_biquad_alt_c_MulSingleAQ28(), for your information. It uses 64-bit multiplications and is about 0.6% - 0.9% slower (for the whole encoder) than the original C code using 32-bit multiplications (for both strides and the same stride 2 unrolling).
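As context for the bit-exactness question, here is a minimal scalar sketch (illustrative only; the helper names are made up and plain C types are used instead of the SILK macros) of the two ways of computing silk_RSHIFT_ROUND( (opus_int64)x * A, 30 ) being compared: the split 14-bit multiplication that keeps every product within a 32x16-bit multiply, and the single rounded 64-bit multiplication used by silk_biquad_alt_c_MulSingleAQ28() below.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch only: A is a Q28 coefficient, x is the Q14
     * intermediate (out32_Q14); both variants target
     * silk_RSHIFT_ROUND( (opus_int64)x * A, 30 ). */

    /* 32-bit-only style: split A into an upper part and a 14-bit lower part
     * so that each partial product fits a 32x16-bit (SMULWB-style) multiply.
     * The intermediate >>16 truncates, so the result can differ from the
     * exactly-rounded 64-bit product by an LSB or two. */
    static int32_t mul_q28_split(int32_t x, int32_t A)
    {
        int32_t A_U = A >> 14;          /* upper part (arithmetic shift) */
        int32_t A_L = A & 0x3FFF;       /* lower 14 bits                 */
        int32_t lo  = (int32_t)(((int64_t)x * A_L) >> 16);  /* truncates */
        int32_t hi  = (int32_t)(((int64_t)x * A_U) >> 16);  /* truncates */
        return hi + ((lo + (1 << 13)) >> 14);
    }

    /* Single widening multiply with rounding, as in
     * silk_biquad_alt_c_MulSingleAQ28() below. */
    static int32_t mul_q28_single(int32_t x, int32_t A)
    {
        return (int32_t)(((int64_t)x * A + ((int64_t)1 << 29)) >> 30);
    }

    int main(void)
    {
        int32_t x = -1234567, A = 0x0ABCDEF;   /* arbitrary test values */
        printf("split  = %ld\n", (long)mul_q28_split(x, A));
        printf("single = %ld\n", (long)mul_q28_single(x, A));
        return 0;
    }

Because the split path truncates before its final rounding, each coefficient multiply can be off by an LSB or two, and since the result feeds back through S[], the error can accumulate over samples, which is the behaviour described above.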
void silk_biquad_alt_c_MulSingleAQ28(
    const opus_int16     *in,        /* I     input signal                          */
    const opus_int32     *B_Q28,     /* I     MA coefficients [3]                   */
    const opus_int32     *A_Q28,     /* I     AR coefficients [2]                   */
    opus_int32           *S,         /* I/O   State vector [2*stride]               */
    opus_int16           *out,       /* O     output signal                         */
    const opus_int32     len,        /* I     signal length (must be even)          */
    opus_int             stride      /* I     Operate on interleaved signal if > 1  */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int k;

    silk_assert( ( stride == 1 ) || ( stride == 2 ) );

    if( stride == 1 ) {
        opus_int32 out32_Q14;
        for( k = 0; k < len; k++ ) {
            /* S[ 0 ], S[ 1 ]: Q12 */
            out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ), 2 );

            S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ 0 ]), 30 );
            S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ k ] );

            S[ 1 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ 1 ]), 30 );
            S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ k ] );

            /* Scale back to Q0 and saturate */
            out[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
        }
    } else {
        opus_int32 out32_Q14[ 2 ];
        for( k = 0; k < len; k++ ) {
            /* S[ 0 ], S[ 1 ]: Q12 */
            out32_Q14[ 0 ] = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k * 2 + 0 ] ), 2 );
            out32_Q14[ 1 ] = silk_LSHIFT( silk_SMLAWB( S[ 2 ], B_Q28[ 0 ], in[ k * 2 + 1 ] ), 2 );

            S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 0 ] * (-A_Q28[ 0 ]), 30 );
            S[ 2 ] = S[ 3 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 1 ] * (-A_Q28[ 0 ]), 30 );
            S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ k * 2 + 0 ] );
            S[ 2 ] = silk_SMLAWB( S[ 2 ], B_Q28[ 1 ], in[ k * 2 + 1 ] );

            S[ 1 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 0 ] * (-A_Q28[ 1 ]), 30 );
            S[ 3 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 1 ] * (-A_Q28[ 1 ]), 30 );
            S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ k * 2 + 0 ] );
            S[ 3 ] = silk_SMLAWB( S[ 3 ], B_Q28[ 2 ], in[ k * 2 + 1 ] );

            /* Scale back to Q0 and saturate */
            out[ k * 2 + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 0 ] + (1<<14) - 1, 14 ) );
            out[ k * 2 + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 1 ] + (1<<14) - 1, 14 ) );
        }
    }
}

Here are the NEON kernels, which use vqrdmulh_lane_s32() to do the multiplication and rounding, where A_Q28_s32x{2,4} stores the doubled -A_Q28[]:

static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4,
                                                  int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
{
    int32x2_t t_s32x2;

    *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));                             /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] )                                 */
    *S_s32x2         = vreinterpret_s32_u64(vshr_n_u64(vreinterpret_u64_s32(*S_s32x2), 32));  /* S[ 0 ] = S[ 1 ]; S[ 1 ] = 0;                                               */
    *out32_Q14_s32x2 = vshl_n_s32(*out32_Q14_s32x2, 2);                                       /* out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ), 2 );  */
    t_s32x2          = vqrdmulh_lane_s32(A_Q28_s32x2, *out32_Q14_s32x2, 0);                   /* silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ {0,1} ]), 30 )         */
    *S_s32x2         = vadd_s32(*S_s32x2, t_s32x2);                                           /* S[ {0,1} ] = {S[ 1 ],0} + silk_RSHIFT_ROUND( );                            */
    *S_s32x2         = vadd_s32(*S_s32x2, vget_high_s32(t_s32x4));                            /* S[ {0,1} ] = silk_SMLAWB( S[ {0,1} ], B_Q28[ {1,2} ], in[ k ] );           */
}

static inline void silk_biquad_alt_stride2_kernel(const int32x4_t A_Q28_s32x4, const int32x4_t B_Q28_s32x4,
                                                  const int32x2_t t_s32x2, const int32x4_t inval_s32x4,
                                                  int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2)
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32(vget_low_s32(*S_s32x4), t_s32x2);            /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in[ k * 2 + {0,1} ] )                                       */
    *S_s32x4         = vcombine_s32(vget_high_s32(*S_s32x4), vdup_n_s32(0)); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                                 */
    *out32_Q14_s32x2 = vshl_n_s32(*out32_Q14_s32x2, 2);                      /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in[ k * 2 + {0,1} ] ), 2 );  */
    out32_Q14_s32x4  = vcombine_s32(*out32_Q14_s32x2, *out32_Q14_s32x2);     /* out32_Q14_{0,1,0,1}                                                                          */
    t_s32x4          = vqrdmulhq_s32(out32_Q14_s32x4, A_Q28_s32x4);          /* silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ {0,1,0,1} ] * (-A_Q28[ {0,0,1,1} ]), 30 )          */
    *S_s32x4         = vaddq_s32(*S_s32x4, t_s32x4);                         /* S[ {0,1,2,3} ] = {S[ {2,3} ],0,0} + silk_RSHIFT_ROUND( );                                    */
    t_s32x4          = vqdmulhq_s32(inval_s32x4, B_Q28_s32x4);               /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in[ k * 2 + {0,1,0,1} ] )                                   */
    *S_s32x4         = vaddq_s32(*S_s32x4, t_s32x4);                         /* S[ {0,1,2,3} ] = silk_SMLAWB( S[ {0,1,2,3} ], B_Q28[ {1,1,2,2} ], in[ k * 2 + {0,1,0,1} ] ); */
}
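A note on the coefficient trick in the kernels above: VQRDMULH computes sat((2*a*b + 2^31) >> 32), so feeding it the doubled, negated coefficient -2*A_Q28 (assuming |A_Q28| < 2^30 so the doubled value still fits in 32 bits) yields ((a * -A_Q28) + 2^29) >> 30, i.e. silk_RSHIFT_ROUND( (opus_int64)a * (-A_Q28), 30 ), apart from saturation at the extreme. A minimal scalar model of that identity (the function names are made up for this sketch):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of vqrdmulh_s32 (saturating rounding doubling multiply,
     * returning the high half): sat( (2*a*b + 2^31) >> 32 ). */
    static int32_t qrdmulh_model_s32(int32_t a, int32_t b)
    {
        int64_t p = 2 * (int64_t)a * b + ((int64_t)1 << 31);
        p >>= 32;                             /* arithmetic shift = floor     */
        if (p > INT32_MAX) p = INT32_MAX;     /* saturation, only hit near the extremes */
        if (p < INT32_MIN) p = INT32_MIN;
        return (int32_t)p;
    }

    /* silk_RSHIFT_ROUND( (opus_int64)x * (-A_Q28), 30 ), written out directly. */
    static int32_t ref_round_shift30(int32_t x, int32_t A_Q28)
    {
        return (int32_t)(((int64_t)x * (-A_Q28) + ((int64_t)1 << 29)) >> 30);
    }

    int main(void)
    {
        int32_t out32_Q14 = 0x01234567;       /* arbitrary test values */
        int32_t A_Q28     = -0x089ABCDE;
        /* Passing the doubled, negated coefficient turns the >>32 of the
         * doubling multiply-high into the rounded >>30 that the C code wants. */
        assert(qrdmulh_model_s32(out32_Q14, -2 * A_Q28) == ref_round_shift30(out32_Q14, A_Q28));
        return 0;
    }

For inputs where neither side saturates or overflows the int32 result, the two expressions agree exactly, so this step is not what makes the NEON code differ from the original silk_biquad_alt().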
Thanks,
Linfeng
Linfeng Zhang
2017-May-08 16:12 UTC
[opus] 2 patches related to silk_biquad_alt() optimization
Ping for comments.

Thanks,
Linfeng

On Wed, Apr 26, 2017 at 2:15 PM, Linfeng Zhang <linfengz at google.com> wrote:

> Please let me know the decision (whether to keep the original NEON (stride 2
> only) or to choose the new NEON (both stride 1 and 2), which matches
> silk_biquad_alt_c_MulSingleAQ28()), and I'll wrap up the patch.
Jean-Marc Valin
2017-May-15 16:36 UTC
[opus] 2 patches related to silk_biquad_alt() optimization
Hi Linfeng,

Sorry for the delay -- I was actually trying to think of the best option here. For now, my preference would be to keep things bit-exact, but should there be more similar optimizations relying on 64-bit multiplication results, then we could consider having a special option to enable those (even in C).

Cheers,

Jean-Marc

On 08/05/17 12:12 PM, Linfeng Zhang wrote:
> Ping for comments.
>
> On Wed, Apr 26, 2017 at 2:15 PM, Linfeng Zhang <linfengz at google.com> wrote:
>
>> Please let me know the decision (whether to keep the original NEON (stride 2
>> only) or to choose the new NEON (both stride 1 and 2), which matches
>> silk_biquad_alt_c_MulSingleAQ28()), and I'll wrap up the patch.
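One possible shape for the "special option" mentioned above would be a build-time define that swaps in the 64-bit-multiply C variant while the default build stays bit-exact. The define name below is hypothetical and does not exist in the tree; this is only a sketch of the idea:

    /* Hypothetical sketch only -- OPUS_BIQUAD_USE_64BIT_MUL is a made-up
     * define, not an existing build option.  The default path stays the
     * current bit-exact silk_biquad_alt(). */
    #ifdef OPUS_BIQUAD_USE_64BIT_MUL
    # define silk_biquad_alt_sel  silk_biquad_alt_c_MulSingleAQ28
    #else
    # define silk_biquad_alt_sel  silk_biquad_alt
    #endif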