Displaying 7 results from an estimated 7 matches for "vdup_n_s16".
2013 Jun 07
2
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...#include <arm_neon.h>
/*
 * NEON-intrinsics cross-correlation kernel: accumulates four shifted
 * correlations at once, i.e. sum[k] += SUM_{j=0..len-1} x[j] * y[j+k]
 * for k = 0..3 (vdup_n_s16 broadcasts x[j] to all 4 lanes; vld1_s16
 * loads y[j..j+3]).
 *
 * x   - input vector, read x[0..len-1]
 * y   - second vector; each vld1_s16 reads FOUR int16 values, so the
 *       maximum index touched is y[len+2]. The caller must guarantee
 *       y is readable through y[len+2] -- NOTE(review): confirm against
 *       the celt_lpc.c caller contract (thread discusses over-reads).
 * sum - in/out: four 32-bit accumulators, loaded at entry and stored
 *       back at exit.
 * len - number of x samples; any non-negative value is handled (the
 *       main loop consumes pairs, the trailing `if` handles an odd tail).
 */
static inline void xcorr_kernel(const opus_val16 *x, const opus_val16
*y, opus_val32 sum[4], int len)
{
int j;
/* xsum1 starts from the caller's accumulators; xsum2 starts at zero so
   the two interleaved accumulators can be summed once at the end. */
int32x4_t xsum1 = vld1q_s32(sum);
int32x4_t xsum2 = vdupq_n_s32(0);
/* Two multiply-accumulates per iteration; x and y each advance by one
   element per vmlal_s16 (the loads deliberately overlap by 3 lanes). */
for (j = 0; j < len-1; j += 2) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x++),vld1_s16(y++));
xsum2 = vmlal_s16(xsum2,vdup_n_s16(*x++),vld1_s16(y++));
}
/* Odd-length tail: one final accumulate (still loads y[j..j+3]). */
if (j < len) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x),vld1_s16(y));
}
/* Fold the two accumulators together and write the result back. */
vst1q_s32(sum,vaddq_s32(xsum1,xsum2));
}
Cheers,
John Ridges
2013 Jun 07
0
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...ine void xcorr_kernel(const opus_val16 *x, const opus_val16
> *y, opus_val32 sum[4], int len)
> {
> int j;
> int32x4_t xsum1 = vld1q_s32(sum);
> int32x4_t xsum2 = vdupq_n_s32(0);
>
> for (j = 0; j < len-1; j += 2) {
> xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x++),vld1_s16(y++));
> xsum2 = vmlal_s16(xsum2,vdup_n_s16(*x++),vld1_s16(y++));
> }
> if (j < len) {
> xsum1 = vmlal_s16(xsum1,vdup_n_s16(*x),vld1_s16(y));
> }
> vst1q_s32(sum,vaddq_s32(xsum1,xsum2));
> }
>
>
> Cheers,
> J...
2016 Aug 23
0
[PATCH 8/8] Optimize silk_NSQ_del_dec() for ARM NEON
...10 ) ) );
+ tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
+ r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );
+
+ /* Find two quantization level candidates and measure their rate-distortion */
+ {
+ int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
+ int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
+ int16x4_t q2_Q10_s16x4;
+ int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
+ uint32x4_t t_u32x4;
+
+ if( Lambda_Q10 > 2048 ) {
+ /* For aggressive RDO, th...
2016 Aug 23
2
[PATCH 7/8] Update NSQ_LPC_BUF_LENGTH macro.
NSQ_LPC_BUF_LENGTH is independent of DECISION_DELAY.
---
silk/define.h | 4 ----
1 file changed, 4 deletions(-)
diff --git a/silk/define.h b/silk/define.h
index 781cfdc..1286048 100644
--- a/silk/define.h
+++ b/silk/define.h
@@ -173,11 +173,7 @@ extern "C"
#define MAX_MATRIX_SIZE MAX_LPC_ORDER /* Max of LPC Order and LTP order */
-#if( MAX_LPC_ORDER >
2013 Jun 07
1
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
...6(xsum1,vdup_lane_s16(x0,0),y0);
xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,1),vext_s16(y0,y4,1));
xsum1 = vmlal_s16(xsum1,vdup_lane_s16(x0,2),vext_s16(y0,y4,2));
xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,3),y3);
}
if (j < len) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
if (++j < len) {
xsum2 = vmlal_s16(xsum2,vdup_n_s16(*(x+j)),vld1_s16(y+j));
if (++j < len) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
}
}
}
vst1q_s32(sum,vaddq_s32(xsu...
2013 Jun 07
2
Bug fix in celt_lpc.c and some xcorr_kernel optimizations
Hi JM,
I have no doubt that Mr. Zanelli's NEON code is faster, since hand tuned
assembly is bound to be faster than using intrinsics. However I notice
that his code can also read past the y buffer.
Cheers,
--John
On 6/6/2013 9:22 PM, Jean-Marc Valin wrote:
> Hi John,
>
> Thanks for the two fixes. They're in git now. Your SSE version seems to
> also be slightly faster than
2013 Jun 10
0
opus Digest, Vol 53, Issue 2
...6(xsum1,vdup_lane_s16(x0,0),y0);
xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,1),vext_s16(y0,y4,1));
xsum1 = vmlal_s16(xsum1,vdup_lane_s16(x0,2),vext_s16(y0,y4,2));
xsum2 = vmlal_s16(xsum2,vdup_lane_s16(x0,3),y3);
}
if (j < len) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
if (++j < len) {
xsum2 = vmlal_s16(xsum2,vdup_n_s16(*(x+j)),vld1_s16(y+j));
if (++j < len) {
xsum1 = vmlal_s16(xsum1,vdup_n_s16(*(x+j)),vld1_s16(y+j));
}
}
}
vst1q_s32(sum,vaddq_s32(xsu...