Please ignore my previous mail and patch, there is a new version :).

Patch changes are:
- Use MAC16_16 macros instead of (sum += a*b) and unroll a loop by 2.
  It increases performance when using optimized macros (e.g. ARMv5E).
  A possible side effect of the loop unroll is that I don't check for
  odd lengths here.
- Add NEON versions of the FIR filter and autocorr
- Add a section in autoconf to check for NEON support

Best regards,

--
Aurélien Zanelli
Parrot SA
174, quai de Jemmapes
75010 Paris
France

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index d2addbf..14a7839 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -33,6 +33,10 @@
 #include "stack_alloc.h"
 #include "mathops.h"
 
+#ifdef ARM_HAVE_NEON
+#include "celt_lpc_neon.h"
+#endif
+
 void _celt_lpc(
 opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */
 const opus_val32 *ac,   /* in:  [0...p] autocorrelation values  */
@@ -87,6 +91,7 @@ int p
 #endif
 }
 
+#ifndef OVERRIDE_CELT_FIR
 void celt_fir(const opus_val16 *x,
          const opus_val16 *num,
          opus_val16 *y,
@@ -101,7 +106,7 @@ void celt_fir(const opus_val16 *x,
       opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum += MULT16_16(num[j],mem[j]);
+         sum = MAC16_16(sum, num[j], mem[j]);
       }
       for (j=ord-1;j>=1;j--)
       {
@@ -111,6 +116,7 @@ void celt_fir(const opus_val16 *x,
       y[i] = ROUND16(sum, SIG_SHIFT);
    }
 }
+#endif
 
 void celt_iir(const opus_val32 *x,
          const opus_val16 *den,
@@ -136,6 +142,7 @@ void celt_iir(const opus_val32 *x,
    }
 }
 
+#ifndef OVERRIDE_CELT_AUTOCORR
 void _celt_autocorr(
                    const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                    opus_val32       *ac,  /* out: [0...lag-1] ac values */
@@ -163,8 +170,12 @@ void _celt_autocorr(
    {
       opus_val32 ac0=0;
       int shift;
-      for(i=0;i<n;i++)
+      int n2 = n>>1;
+      for(i=0;i<n2;i++)
+      {
          ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
+         ac0 += SHR32(MULT16_16(xx[n2+i],xx[n2+i]),9);
+      }
       ac0 += 1+n;
 
       shift = celt_ilog2(ac0)-30+10;
@@ -176,7 +187,7 @@ void _celt_autocorr(
    while (lag>=0)
    {
       for (i = lag, d = 0; i < n; i++)
-         d += xx[i] * xx[i-lag];
+         d = MAC16_16(d, xx[i], xx[i-lag]);
       ac[lag] = d;
       /*printf ("%f ", ac[lag]);*/
       lag--;
@@ -186,3 +197,4 @@ void _celt_autocorr(
 
    RESTORE_STACK;
 }
+#endif
diff --git a/celt/celt_lpc_neon.h b/celt/celt_lpc_neon.h
new file mode 100644
index 0000000..e9f76c6
--- /dev/null
+++ b/celt/celt_lpc_neon.h
@@ -0,0 +1,485 @@
+/* Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_NEON_H
+#define CELT_LPC_NEON_H
+
+#ifdef FIXED_POINT
+
+#ifndef DISABLE_CELT_FIR_NEON
+#define OVERRIDE_CELT_FIR
+/* Optimized FIR filter for orders 1 and 4, which are the orders used by
+ * the Opus encoder: the FIR calls in pitch.c are hard-coded with order
+ * values of 1 and 4.
+ *
+ * TODO: Test one-sample-at-a-time filtering
+ */
+
+/* Order 1 NEON FIR filter implementation */
+static void celt_fir1(const opus_val16 *x, opus_val16 num, opus_val16 *y,
+                      int N, opus_val16 mem)
+{
+   int i;
+
+   __asm__ __volatile__(
+      "vdup.s16 d8, %1;\n"  //Duplicate num in d8 lane
+      "vdup.s16 q5, %4;\n"  //Duplicate mem in q5 lane
+
+      /* We try to process 16 samples at a time */
+      "movs %5, %3, lsr #4;\n"
+      "beq .celt_fir1_process16_done_%=;\n"
+
+      ".celt_fir1_process16_%=:\n"
+      /* Load 16 x values in q0, q1 lanes */
+      "vld1.16 {q0-q1}, [%0]!;\n"
+
+      /* Init four 32-bit sums in q7, q8, q9, q10 lanes */
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+      "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+      "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+      "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+      /* Build the previous-sample vectors for the MAC in q5, q6 lanes */
+      "vext.16 q5, q5, q0, #7;\n"
+      "vext.16 q6, q0, q1, #7;\n"
+
+      /* Do 16 samples of filtering at a time */
+      "vmlal.s16 q7, d8, d10;\n"
+      "vmlal.s16 q8, d8, d11;\n"
+      "vmlal.s16 q9, d8, d12;\n"
+      "vmlal.s16 q10, d8, d13;\n"
+
+      /* Reduce filter sums to 16 bits for the y output */
+      "vrshrn.s32 d4, q7, %[SIGSHIFT];\n"
+      "vrshrn.s32 d5, q8, %[SIGSHIFT];\n"
+      "vrshrn.s32 d6, q9, %[SIGSHIFT];\n"
+      "vrshrn.s32 d7, q10, %[SIGSHIFT];\n"
+
+      "pld [%0, #0];\n"
+
+      /* Duplicate the last x sample into q5 to build the next "previous"
+       * sample vector */
+      "vdup.s16 q5, d3[3];\n"
+
+      /* Store 16 y results */
+      "vst1.16 {q2-q3}, [%2]!;\n"
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir1_process16_%=;\n"
+      ".celt_fir1_process16_done_%=:\n"
+
+      /* Check if some samples remain */
+      "ands %5, %3, #15;\n"
+      "beq .celt_fir1_done_%=;\n"
+
+      /* Process remaining samples one by one with NEON.
+       * The previous sample is stored at the top of d11 in all cases,
+       * so we store the top result of the vector operation */
+      ".celt_fir1_process_remaining_%=:\n"
+      "vld1.16 d0[0], [%0]!;\n"           //Load x
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"  //Initialize sum
+      "vmlal.s16 q7, d8, d11;\n"          //Multiply-accumulate
+      "vrshrn.s32 d4, q7, %[SIGSHIFT];\n" //Scale result
+      "vmov.s16 d11, d0;\n"               //Move previous
+      "vst1.16 d4[3], [%2]!;\n"           //Store result
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir1_process_remaining_%=;\n"
+
+      ".celt_fir1_done_%=:\n"
+      : "=r"(x), "=r"(num), "=r"(y), "=r"(N), "=r"(mem), "=r"(i)
+      : "0"(x), "1"(num), "2"(y), "3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+      /* Clobber d0-d21 because some gcc versions (4.4.3) don't alias q(x)
+       * to d(x), d(x+1) */
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+        "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
+        "d19", "d20", "d21"
+   );
+}
+
+/* Order 4 FIR filter with NEON */
+static void celt_fir4(const opus_val16 *x, const opus_val16 *num,
+                      opus_val16 *y, int N, opus_val16 *mem)
+{
+   int i;
+
+   __asm__ __volatile__(
+      "vld1.16 {d4}, [%1];\n"   //Load num in d4 lane
+      "vld1.16 {d11}, [%4];\n"  //Load provided mem in d11 lane
+
+      /* We try to process 16 samples at a time */
+      "movs %5, %3, lsr #4;\n"
+      "beq .celt_fir4_process16_done_%=;\n"
+
+      /* Reverse the provided mem order because we process in reverse order */
+      "vrev64.16 d11, d11;\n"
+
+      ".celt_fir4_process16_%=:\n"
+      /* Load 16 x values in q0, q1 lanes */
+      "vld1.16 {q0-q1}, [%0]!;\n"
+
+      /* Init four 32-bit sums in q7, q8, q9, q10 lanes */
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+      "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+      "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+      "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+      /* Build the previous-sample vectors used by the filter:
+       * each sample needs the four previous samples.
+       * We use q lanes to store them */
+      "vext.16 q5, q5, q0, #4;\n"
+      "vext.16 q6, q0, q1, #4;\n"
+
+      /* Do 16 samples of filtering at a time, applying the filter in
+       * reverse order: begin by multiply-accumulating the last sample
+       * and coefficient */
+      "vmlal.s16 q7, d10, d4[3];\n"
+      "vmlal.s16 q8, d11, d4[3];\n"
+      "vmlal.s16 q9, d12, d4[3];\n"
+      "vmlal.s16 q10, d13, d4[3];\n"
+
+      /* Prepare samples for n-3 sample processing */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d13, #1;\n"
+      "vext.16 d22, d13, d3, #1;\n" //We use d22 because we need to access d3 by group
+
+      "vmlal.s16 q7, d10, d4[2];\n"
+      "vmlal.s16 q8, d11, d4[2];\n"
+      "vmlal.s16 q9, d12, d4[2];\n"
+      "vmlal.s16 q10, d22, d4[2];\n"
+
+      /* Prepare samples for n-2 sample processing */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d22, #1;\n"
+      "vext.16 d22, d13, d3, #2;\n"
+
+      "vmlal.s16 q7, d10, d4[1];\n"
+      "vmlal.s16 q8, d11, d4[1];\n"
+      "vmlal.s16 q9, d12, d4[1];\n"
+      "vmlal.s16 q10, d22, d4[1];\n"
+
+      /* Prepare samples for n-1 sample processing */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d22, #1;\n"
+      "vext.16 d22, d13, d3, #3;\n"
+
+      "vmlal.s16 q7, d10, d4[0];\n"
+      "vmlal.s16 q8, d11, d4[0];\n"
+      "vmlal.s16 q9, d12, d4[0];\n"
+      "vmlal.s16 q10, d22, d4[0];\n"
+
+      /* Reduce filter sums to 16 bits for the y output */
+      "vrshrn.s32 d6, q7, %[SIGSHIFT];\n"
+      "vrshrn.s32 d7, q8, %[SIGSHIFT];\n"
+      "vrshrn.s32 d8, q9, %[SIGSHIFT];\n"
+      "vrshrn.s32 d9, q10, %[SIGSHIFT];\n"
+
+      "pld [%0, #0];\n"
+
+      /* Move the last four x samples into d11 to build the next
+       * "previous" sample vector */
+      "vmov.s16 d11, d3;\n"
+
+      /* Store 16 y results */
+      "vst1.16 {q3-q4}, [%2]!;\n"
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir4_process16_%=;\n"
+      ".celt_fir4_process16_done_%=:\n"
+
+      /* Check if some samples remain */
+      "ands %5, %3, #15;\n"
+      "beq .celt_fir4_done_%=;\n"
+
+      /* Process remaining samples one by one with NEON.
+       * The previous samples are stored in d11 in all cases,
+       * so we reduce the four products of the vector operation */
+      ".celt_fir4_process_remaining_%=:\n"
+      "vld1.16 d0[0], [%0]!;\n"           //Load x
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"  //Initialize sum
+      "vmull.s16 q8, d4, d11;\n"          //Multiply num by previous samples
+      "vadd.s32 d16, d16, d17;\n"         //Next three instructions reduce the sum
+      "vpadd.s32 d16, d16;\n"
+      "vadd.s16 d14, d14, d16;\n"
+      "vrshrn.s32 d6, q7, %[SIGSHIFT];\n" //Scale result to 16 bits
+      "vmov.s16 d11, d0;\n"               //Move previous
+      "vst1.16 d6[0], [%2]!;\n"           //Store result
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir4_process_remaining_%=;\n"
+
+      ".celt_fir4_done_%=:\n"
+      : "=r"(x), "=r"(num), "=r"(y), "=r"(N), "=r"(mem), "=r"(i)
+      : "0"(x), "1"(num), "2"(y), "3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+      /* Clobber d0-d22 because some gcc versions (4.4.3) don't alias q(x)
+       * to d(x), d(x+1) */
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+        "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
+        "d19", "d20", "d21", "d22"
+   );
+}
+
+void celt_fir(const opus_val16 *x, const opus_val16 *num, opus_val16 *y,
+              int N, int ord, opus_val16 *mem)
+{
+   int i,j;
+
+   switch(ord)
+   {
+      case 1:
+         celt_fir1(x, *num, y, N, *mem);
+         break;
+
+      case 4:
+         celt_fir4(x, num, y, N, mem);
+         break;
+
+      default:
+         for (i=0;i<N;i++)
+         {
+            opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+            for (j=0;j<ord;j++)
+               sum = MAC16_16(sum, num[j], mem[j]);
+            for (j=ord-1;j>=1;j--)
+               mem[j]=mem[j-1];
+            mem[0] = x[i];
+            y[i] = ROUND16(sum, SIG_SHIFT);
+         }
+         break;
+   }
+}
+#endif /* DISABLE_CELT_FIR_NEON */
+
+
+#ifndef DISABLE_CELT_AUTOCORR_NEON
+#define OVERRIDE_CELT_AUTOCORR
+void _celt_autocorr(
+                   const opus_val16 *x,   /*  in: [0...n-1] samples x   */
+                   opus_val32       *ac,  /* out: [0...lag-1] ac values */
+                   const opus_val16 *window,
+                   int overlap,
+                   int lag,
+                   int n
+                   )
+{
+   opus_val32 d;
+   int i;
+   VARDECL(opus_val16, xx);
+   SAVE_STACK;
+   ALLOC(xx, n, opus_val16);
+   celt_assert(n>0);
+   celt_assert(overlap>=0);
+   for (i=0;i<n;i++)
+      xx[i] = x[i];
+
+   {
+      opus_val16 *xxbeg = xx;
+      opus_val16 *xxend = xx+n-1;
+      const opus_val16 *xbeg = x;
+      const opus_val16 *xend = x+n-1;
+      int scratch0, scratch1, scratch2, scratch3, scratch4;
+      __asm__ __volatile__(
+         "movs %6, %5, lsr #3;\n"
+         "beq .celt_autocorr_process8_done_%=;\n"
+
+         /* Process 8 samples at a time */
+         ".celt_autocorr_process8_%=:\n"
+         "subs %3, %3, #16;\n"
+         "subs %1, %1, #16;\n"
+         "vld1.16 {q2}, [%4]!;\n" //Load 8 window values
+         "vld1.16 {q0}, [%2]!;\n" //Load 8 x values from beg
+         "vld1.16 {q1}, [%3];\n"  //Load 8 x values from end
+
+         /* MULT16_16_Q15(x[i],window[i]) */
+         "vmull.s16 q3, d0, d4;\n"
+         "vmull.s16 q4, d1, d5;\n"
+
+         "pld [%4, #0];\n"
+
+         /* MULT16_16_Q15(x[n-i-1],window[i]) */
+         "vmull.s16 q5, d2, d4;\n"
+         "vmull.s16 q6, d3, d5;\n"
+
+         "pld [%2, #0];\n"
+
+         /* Shift right by 15 */
+         "vshrn.s32 d0, q3, #15;\n"
+         "vshrn.s32 d1, q4, #15;\n"
+         "vshrn.s32 d2, q5, #15;\n"
+         "vshrn.s32 d3, q6, #15;\n"
+
+         "pld [%3, #-16];\n"
+
+         "vst1.16 {q0}, [%0]!;\n"
+         "vst1.16 {q1}, [%1];\n"
+
+         "subs %6, %6, #1;\n"
+         "bne .celt_autocorr_process8_%=;\n"
+         ".celt_autocorr_process8_done_%=:\n"
+
+         "ands %6, %5, #7;\n"
+         "beq .celt_autocorr_done_%=;\n"
+
+         /* Process remaining samples */
+         ".celt_autocorr_process_remaining_%=:\n"
+         "subs %3, %3, #2;\n"
+         "subs %1, %1, #2;\n"
+         "vld1.16 d4[0], [%4]!;\n" //Load 1 window value
+         "vld1.16 d0[0], [%2]!;\n" //Load 1 x value from beg
+         "vld1.16 d0[1], [%3];\n"  //Load 1 x value from end
+
+         "vmull.s16 q3, d0, d4[0];\n"
+         "vshrn.s32 d0, q3, #15;\n"
+
+         "vst1.16 d0[0], [%0]!;\n"
+         "vst1.16 d0[1], [%1];\n"
+
+         "subs %6, %6, #1;\n"
+         "bne .celt_autocorr_process_remaining_%=;\n"
+         ".celt_autocorr_done_%=:\n"
+         : "=r"(scratch0), "=r"(scratch1), "=r"(scratch2), "=r"(scratch3),
+           "=r"(scratch4), "=r"(overlap), "=r"(i)
+         : "0"(xxbeg), "1"(xxend), "2"(xbeg), "3"(xend), "4"(window), "5"(overlap)
+         /* Clobber d0-d13 because some gcc versions (4.4.3) don't alias q(x)
+          * to d(x), d(x+1) */
+         : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+           "d9", "d10", "d11", "d12", "d13"
+      );
+   }
+
+
+   {
+      opus_val32 ac0;
+      int shift;
+      int scratch1;
+      __asm__ __volatile__(
+         "veor.s32 q0, q0, q0;\n"
+         "movs %3, %2, lsr #3;\n"
+         "beq .celt_autocorr_process8_done_%=;\n"
+
+         /* Process 8 samples at a time */
+         ".celt_autocorr_process8_%=:\n"
+         "vld1.16 {q1}, [%1]!;\n"  //Load 8 xx values
+         "subs %3, %3, #1;\n"
+         "vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i])
+         "vmull.s16 q3, d3, d3;\n" //MULT16_16(xx[i], xx[i])
+         "pld [%1, #0];\n"
+         "vsra.s32 q0, q2, #9;\n"  //Shift right by 9 and accumulate to ac0
+         "vsra.s32 q0, q3, #9;\n"  //Shift right by 9 and accumulate to ac0
+         "bne .celt_autocorr_process8_%=;\n"
+
+         ".celt_autocorr_process8_done_%=:\n"
+         "ands %3, %2, #7;\n"
+         "beq .celt_autocorr_process_remaining_done_%=;\n"
+
+         /* Process remaining samples */
+         "veor.s16 q1, q1, q1;\n"  //Clear q1 so we don't accumulate stale values
+         ".celt_autocorr_process_remaining_%=:\n"
+         "vld1.16 d2[0], [%1]!;\n" //Load 1 xx value
+         "subs %3, %3, #1;\n"
+         "vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i])
+         "vsra.s32 q0, q2, #9;\n"  //Shift right by 9 and accumulate to ac0
+         "bne .celt_autocorr_process_remaining_%=;\n"
+         ".celt_autocorr_process_remaining_done_%=:\n"
+
+         /* Reduce the sum and move the result to an ARM register */
+         "vadd.s32 d0, d0, d1;\n"
+         "vpadd.s32 d0, d0;\n"
+         "vmov.s32 %0, d0[0];\n"
+         : "=r"(ac0), "=r"(scratch1), "=r"(n), "=r"(i)
+         : "1"(xx), "2"(n)
+         /* Clobber d0-d7 because some gcc versions (4.4.3) don't alias q(x)
+          * to d(x), d(x+1) */
+         : "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
+      );
+      ac0 += 1+n;
+
+      shift = celt_ilog2(ac0)-30+10;
+      shift = (shift+1)/2;
+      for(i=0;i<n;i++)
+         xx[i] = VSHR32(xx[i], shift);
+   }
+
+   while (lag>=0)
+   {
+      opus_val16 *xx1 = xx+lag;
+      opus_val16 *xx2 = xx;
+      int scratch4, scratch5;
+
+      __asm__ __volatile__(
+         "veor.s32 q0, q0;\n"        //Clear sum; q0 will hold 4 partial sums
+         "movs %3, %4, lsr #4;\n"    //(n-lag)/16
+         "beq .celt_autocorr_process16_done_%=;\n"
+
+         /* Process 16 samples at a time */
+         ".celt_autocorr_process16_%=:\n"
+         "vld1.16 {q1-q2}, [%1]!;\n" //Load 16 xx values from xx+lag=xx[i]
+         "vld1.16 {q3-q4}, [%2]!;\n" //Load 16 xx values from xx=xx[i-lag]
+         "vmlal.s16 q0, d2, d6;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+         "vmlal.s16 q0, d3, d7;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+         "pld [%1, #0];\n"
+         "pld [%2, #0];\n"
+         "vmlal.s16 q0, d4, d8;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+         "vmlal.s16 q0, d5, d9;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+         "pld [%1, #16];\n"
+         "pld [%2, #16];\n"
+         "subs %3, %3, #1;\n"
+         "bne .celt_autocorr_process16_%=;\n"
+         ".celt_autocorr_process16_done_%=:\n"
+
+         "ands %3, %4, #15;\n"       //(n-lag)&15
+         "beq .celt_autocorr_process_remaining_done_%=;\n"
+         "veor.s32 q1, q1, q1;\n"    //Clear q1
+
+         /* Process remaining samples one by one */
+         ".celt_autocorr_process_remaining_%=:\n"
+         "vld1.16 d2[0], [%1]!;\n"   //Load 1 xx value from xx+lag=xx[i]
+         "vld1.16 d3[0], [%2]!;\n"   //Load 1 xx value from xx=xx[i-lag]
+         "subs %3, %3, #1;\n"
+         "vmlal.s16 q0, d2, d3;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+         "bne .celt_autocorr_process_remaining_%=;\n"
+         ".celt_autocorr_process_remaining_done_%=:\n"
+
+         /* Reduce the sum and store it */
+         "vadd.s32 d0, d0, d1;\n"
+         "vpadd.s32 d0, d0;\n"
+         "vst1.32 d0[0], [%5];\n"
+         : "=&r"(d), "=r"(xx1), "=r"(xx2), "=r"(i), "=r"(scratch4), "=r"(scratch5)
+         : "0"(0), "1"(xx1), "2"(xx2), "4"(n-lag), "5"(ac+lag)
+         /* Clobber d0-d9 because some gcc versions (4.4.3) don't alias q(x)
+          * to d(x), d(x+1) */
+         : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+           "d9"
+      );
+      lag--;
+   }
+   ac[0] += 10;
+
+   RESTORE_STACK;
+}
+#endif /* DISABLE_CELT_AUTOCORR_NEON */
+
+#endif /* FIXED_POINT */
+
+#endif
diff --git a/configure.ac b/configure.ac
index 0c6d725..a36d403 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,6 +178,11 @@ if test "x${ac_enable_asm}" = xyes ; then
         AC_DEFINE(ARMv6_ASM, 1, [Use ARMv6 asm optimizations])
         asm_optimization="${asm_optimization} (Media)"
       fi
+      AS_ASM_ARM_NEON([ARM_HAVE_NEON=1],[ARM_HAVE_NEON=0])
+      if test "x${ARM_HAVE_NEON}" = "x1" ; then
+        AC_DEFINE([ARM_HAVE_NEON], 1, [Use ARM NEON optimizations])
+        asm_optimization="${asm_optimization} (NEON)"
+      fi
     fi
     ;;
 esac
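As context for the unroll-by-2 described in the mail above: because the
energy loop in _celt_autocorr processes samples in pairs, an odd n would
silently drop the last sample. A minimal scalar sketch of how the odd tail
could be handled (autocorr_energy is a hypothetical helper, not part of the
patch; SHR32 and MULT16_16 are the fixed-point macros from celt/arch.h):

static opus_val32 autocorr_energy(const opus_val16 *xx, int n)
{
   opus_val32 ac0 = 0;
   int i, n2 = n>>1;
   for (i=0;i<n2;i++)
   {
      ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
      ac0 += SHR32(MULT16_16(xx[n2+i],xx[n2+i]),9);
   }
   if (n&1)   /* last sample, missed by the pairwise loop when n is odd */
      ac0 += SHR32(MULT16_16(xx[n-1],xx[n-1]),9);
   return ac0;
}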
Timothy B. Terriberry
2013-May-21 20:54 UTC
[opus] [PATCH] 02-Add CELT filter optimizations
Aurélien Zanelli wrote:
> Please ignore my previous mail and patch, there is a new version :).
>
> Patch changes are:
> - Use MAC16_16 macros instead of (sum += a*b) and unroll a loop by 2.
>   It increases performance when using optimized macros (e.g. ARMv5E).
>   A possible side effect of the loop unroll is that I don't check for
>   odd lengths here.
> - Add NEON versions of the FIR filter and autocorr
> - Add a section in autoconf to check for NEON support

As Peter Robinson pointed out, we need runtime CPU detection for NEON.
Even if we know at compile time that we're targeting ARMv7, some chips
have NEON and some don't, and Debian, Android apps, Firefox, etc., all
need a single build that runs on both.

We did some design discussion in #opus this morning. The short-term plan
is to port over the libtheora ARM CPU detection code. Instead of having
function tables in the state structs, however, the plan is to use an
index into a read-only list of functions, so e.g.,
ptr_funcs[st->arch&ARCH_MASK] can select one without the risk of buffer
overflows corrupting st and leading to arbitrary code execution. If you
want to start implementing that, let me know; otherwise I'll take a
crack at it.

Also, when replacing whole functions, I think we should use separate
RVCT-syntax assembly files instead of inline asm, for portability. We
can translate to gas syntax with a simple Perl script (libtheora and
libvpx use this strategy).
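A rough sketch of that read-only dispatch scheme (all names here are
hypothetical, not actual Opus API; assume st->arch is set once by the
runtime CPU detection):

/* One const table per optimized function, indexed by a small arch id.
 * Masking the index keeps a corrupted st->arch from reading outside
 * the table, so the worst case is calling the wrong entry, never an
 * arbitrary code address. */
typedef void (*celt_fir_func)(const opus_val16 *x, const opus_val16 *num,
                              opus_val16 *y, int N, int ord,
                              opus_val16 *mem);

#define ARCH_MASK 3   /* table size is a power of two */

static const celt_fir_func celt_fir_impl[ARCH_MASK+1] = {
   celt_fir_c,     /* 0: plain C  */
   celt_fir_edsp,  /* 1: ARMv5E   */
   celt_fir_neon,  /* 2: NEON     */
   celt_fir_c      /* 3: padding  */
};

/* At a call site: */
celt_fir_impl[st->arch&ARCH_MASK](x, num, y, N, ord, mem);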
Timothy B. Terriberry
2013-May-22 22:48 UTC
[opus] [PATCH] 02-Add CELT filter optimizations
Aurélien Zanelli wrote:
> Patch changes are:
> - Use MAC16_16 macros instead of (sum += a*b) and unroll a loop by 2.
>   It increases performance when using optimized macros (e.g. ARMv5E).
>   A possible side effect of the loop unroll is that I don't check for
>   odd lengths here.
> - Add NEON versions of the FIR filter and autocorr
> - Add a section in autoconf to check for NEON support

I split out the arch-independent changes and committed them as
<https://git.xiph.org/?p=opus.git;h=85ede2c6aa06;a=commitdiff>.