search for: vdup_n_s16

Displaying 7 results from an estimated 7 matches for "vdup_n_s16".

2013 Jun 07
2
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...#include <arm_neon.h>

static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   int32x4_t xsum1 = vld1q_s32(sum);
   int32x4_t xsum2 = vdupq_n_s32(0);

   for (j = 0; j < len-1; j += 2) {
      xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x++),vld1_s16(y++));
      xsum2 = vmlal_s16(xsum2,vdup_n_s16(*x++),vld1_s16(y++));
   }
   if (j < len) {
      xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x),vld1_s16(y));
   }
   vst1q_s32(sum,vaddq_s32(xsum1,xsum2));
}

Cheers,
John Ridges
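For context, each call to this kernel accumulates four cross-correlations at consecutive lags, sum[k] += x[i]*y[i+k] summed over i. Below is a minimal sketch of a caller, assuming the Opus fixed-point typedefs (opus_val16 = short, opus_val32 = int); note that the 4-wide vld1_s16 loads mean y must have len+3 readable samples, which is the overread issue discussed further down this thread.

   #include <stdio.h>

   typedef short opus_val16;   /* assumption: Opus fixed-point typedefs */
   typedef int   opus_val32;

   /* xcorr_kernel() as defined above */

   int main(void)
   {
      opus_val16 x[8]   = {1, 2, 3, 4, 5, 6, 7, 8};
      opus_val16 y[11]  = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};  /* len+3 samples readable */
      opus_val32 sum[4] = {0, 0, 0, 0};

      xcorr_kernel(x, y, sum, 8);   /* sum[k] += dot(x, y+k), k = 0..3 */
      printf("%d %d %d %d\n", sum[0], sum[1], sum[2], sum[3]);  /* prints: 36 36 36 36 */
      return 0;
   }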
2013 Jun 07
0
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...ine void xcorr_kernel(const opus_val16 *x, const opus_val16
> *y, opus_val32 sum[4], int len)
> {
>    int j;
>    int32x4_t xsum1 = vld1q_s32(sum);
>    int32x4_t xsum2 = vdupq_n_s32(0);
>
>    for (j = 0; j < len-1; j += 2) {
>       xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x++),vld1_s16(y++));
>       xsum2 = vmlal_s16(xsum2,vdup_n_s16(*x++),vld1_s16(y++));
>    }
>    if (j < len) {
>       xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x),vld1_s16(y));
>    }
>    vst1q_s32(sum,vaddq_s32(xsum1,xsum2));
> }
>
> Cheers,
> J...
2016 Aug 23
0
[PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
...10 ) ) );
+        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
+        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );
+
+        /* Find two quantization level candidates and measure their rate-distortion */
+        {
+            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
+            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
+            int16x4_t q2_Q10_s16x4;
+            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
+            uint32x4_t t_u32x4;
+
+            if( Lambda_Q10 > 2048 ) {
+                /* For aggressive RDO, th...
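Read lane-wise, the vectorized snippet above clamps the residual to 30 in Q10, narrows it to 16 bits, then subtracts the quantization offset and shifts right by 10 to form the first candidate level. A scalar sketch of one lane (illustrative only; the names mirror the patch, and the real code processes four decision-delay states at once):

   #include <stdint.h>

   /* One-lane scalar view of the candidate computation above (illustrative). */
   static void first_candidate(int32_t tmp1_Q10, int16_t offset_Q10,
                               int16_t *q1_Q10, int16_t *q1_Q0)
   {
      if (tmp1_Q10 > (30 << 10))                /* vminq_s32: clamp to 30 in Q10 */
         tmp1_Q10 = 30 << 10;
      int16_t r_Q10 = (int16_t)tmp1_Q10;        /* vmovn_s32: narrow to 16 bits  */
      *q1_Q10 = (int16_t)(r_Q10 - offset_Q10);  /* vsub_s16: drop the offset     */
      *q1_Q0  = (int16_t)(*q1_Q10 >> 10);       /* vshr_n_s16: integer level     */
   }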
2016 Aug 23
2
[PATCH 7/8] Update NSQ_LPC_BUF_LENGTH macro.
NSQ_LPC_BUF_LENGTH is independent of DECISION_DELAY.
---
 silk/define.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/silk/define.h b/silk/define.h
index 781cfdc..1286048 100644
--- a/silk/define.h
+++ b/silk/define.h
@@ -173,11 +173,7 @@ extern "C"
 #define MAX_MATRIX_SIZE  MAX_LPC_ORDER /* Max of LPC Order and LTP order */
-#if( MAX_LPC_ORDER >
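The excerpt is cut off, but the stat line (4 deletions, no additions) together with the commit message suggests the patch drops a conditional that picked the larger of MAX_LPC_ORDER and DECISION_DELAY. Presumably (an assumption; the full diff is not shown in this excerpt) the removed block looked roughly like:

   /* Hypothetical reconstruction, not part of the excerpt: the four
      deleted lines wrap a definition that afterwards stands alone. */
   #if( MAX_LPC_ORDER > DECISION_DELAY )          /* deleted */
   #define NSQ_LPC_BUF_LENGTH  MAX_LPC_ORDER      /* kept    */
   #else                                          /* deleted */
   #define NSQ_LPC_BUF_LENGTH  DECISION_DELAY     /* deleted */
   #endif                                         /* deleted */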
2013 Jun 07
1
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...6(xsum1,vdup_lane_s16(x0,0),y0);
      xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,1),vext_s16(y0,y4,1));
      xsum1 = vmlal_s16(xsum1,vdup_lane_s16(x0,2),vext_s16(y0,y4,2));
      xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,3),y3);
   }
   if (j < len) {
      xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
      if (++j < len) {
         xsum2 = vmlal_s16(xsum2,vdup_n_s16(*(x+j)),vld1_s16(y+j));
         if (++j < len) {
            xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
         }
      }
   }
   vst1q_s32(sum,vaddq_s32(xsu...
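The trick in this version is that vdup_lane_s16 broadcasts one lane of the x vector while vext_s16 concatenates two already-loaded y vectors and extracts a shifted 4-sample window, so one pair of loads feeds all four shifted multiply-accumulates. A small standalone illustration (values chosen purely for demonstration):

   #include <stdio.h>
   #include <arm_neon.h>

   int main(void)
   {
      short buf[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      int16x4_t y0 = vld1_s16(buf);         /* {10,11,12,13} */
      int16x4_t y4 = vld1_s16(buf + 4);     /* {14,15,16,17} */
      int16x4_t w  = vext_s16(y0, y4, 1);   /* shifted window: {11,12,13,14} */
      short out[4];
      vst1_s16(out, w);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      return 0;
   }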
2013 Jun 07
2
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
Hi JM,

I have no doubt that Mr. Zanelli's NEON code is faster, since hand-tuned
assembly is bound to be faster than using intrinsics. However I notice that
his code can also read past the end of the y buffer.

Cheers,
--John

On 6/6/2013 9:22 PM, Jean-Marc Valin wrote:
> Hi John,
>
> Thanks for the two fixes. They're in git now. Your SSE version seems to
> also be slightly faster than
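One conventional way to remove the overread John points out (an illustrative fix only, not the change that actually landed in git): route any partial tail through a zero-padded scratch buffer, so the 4-wide vector load never touches memory beyond what the caller owns.

   #include <string.h>
   #include <arm_neon.h>

   /* Hypothetical helper: vld1_s16() always reads 4 int16 values, so when
      fewer than 4 samples remain, copy them into a zero-padded scratch
      array and load from there instead of reading past the end of y. */
   static inline int16x4_t vld1_s16_tail(const short *y, int remaining)
   {
      short tmp[4] = {0, 0, 0, 0};
      memcpy(tmp, y, (remaining < 4 ? remaining : 4) * sizeof(short));
      return vld1_s16(tmp);
   }

The copy runs only for the last one to three samples, so the main loop's throughput is unaffected.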
2013 Jun 10
0
opus Digest, Vol 53, Issue 2
...6(xsum1,vdup_lane_s16(x0,0),y0);
      xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,1),vext_s16(y0,y4,1));
      xsum1 = vmlal_s16(xsum1,vdup_lane_s16(x0,2),vext_s16(y0,y4,2));
      xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,3),y3);
   }
   if (j < len) {
      xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
      if (++j < len) {
         xsum2 = vmlal_s16(xsum2,vdup_n_s16(*(x+j)),vld1_s16(y+j));
         if (++j < len) {
            xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
         }
      }
   }
   vst1q_s32(sum,vaddq_s32(xsu...