- Use the MAC16_16 macro instead of (sum += a*b) and unroll the energy loop
by 2. This increases performance when optimized macros are available (e.g.
on ARMv5E). A possible side effect of the loop unrolling is that odd
lengths are not checked for here; see the before/after sketch below.
- Add NEON versions of the FIR filter and autocorr.
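
For reference, here is a minimal before/after sketch of the two scalar
changes (identifiers as in celt_lpc.c; MAC16_16 is the fixed-point
multiply-accumulate macro, which optimized targets can map to a single
multiply-accumulate instruction):

    /* Before: plain multiply-add in the FIR inner loop */
    for (j=0;j<ord;j++)
        sum += MULT16_16(num[j],mem[j]);

    /* After: the accumulation happens inside the macro */
    for (j=0;j<ord;j++)
        sum = MAC16_16(sum, num[j], mem[j]);

    /* Energy loop unrolled by 2 by splitting the buffer in halves;
     * n is assumed to be even (odd lengths are not checked) */
    int n2 = n>>1;
    for (i=0;i<n2;i++)
    {
        ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
        ac0 += SHR32(MULT16_16(xx[n2+i],xx[n2+i]),9);
    }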
--
Aurélien Zanelli
Parrot SA
174, quai de Jemmapes
75010 Paris
France
-------------- next part --------------
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index d2addbf..14a7839 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -33,6 +33,10 @@
#include "stack_alloc.h"
#include "mathops.h"
+#ifdef ARM_HAVE_NEON
+#include "celt_lpc_neon.h"
+#endif
+
void _celt_lpc(
opus_val16 *_lpc, /* out: [0...p-1] LPC coefficients */
const opus_val32 *ac, /* in: [0...p] autocorrelation values */
@@ -87,6 +91,7 @@ int p
#endif
}
+#ifndef OVERRIDE_CELT_FIR
void celt_fir(const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
@@ -101,7 +106,7 @@ void celt_fir(const opus_val16 *x,
opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
{
- sum += MULT16_16(num[j],mem[j]);
+ sum = MAC16_16(sum, num[j], mem[j]);
}
for (j=ord-1;j>=1;j--)
{
@@ -111,6 +116,7 @@ void celt_fir(const opus_val16 *x,
y[i] = ROUND16(sum, SIG_SHIFT);
}
}
+#endif
void celt_iir(const opus_val32 *x,
const opus_val16 *den,
@@ -136,6 +142,7 @@ void celt_iir(const opus_val32 *x,
}
}
+#ifndef OVERRIDE_CELT_AUTOCORR
void _celt_autocorr(
const opus_val16 *x, /* in: [0...n-1] samples x */
opus_val32 *ac, /* out: [0...lag-1] ac values */
@@ -163,8 +170,12 @@ void _celt_autocorr(
{
opus_val32 ac0=0;
int shift;
- for(i=0;i<n;i++)
+ int n2 = n>>1;
+ for(i=0;i<n2;i++)
+ {
ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
+ ac0 += SHR32(MULT16_16(xx[n2+i],xx[n2+i]),9);
+ }
ac0 += 1+n;
shift = celt_ilog2(ac0)-30+10;
@@ -176,7 +187,7 @@ void _celt_autocorr(
while (lag>=0)
{
for (i = lag, d = 0; i < n; i++)
- d += xx[i] * xx[i-lag];
+ d = MAC16_16(d, xx[i], xx[i-lag]);
ac[lag] = d;
/*printf ("%f ", ac[lag]);*/
lag--;
@@ -186,3 +197,4 @@ void _celt_autocorr(
RESTORE_STACK;
}
+#endif
diff --git a/celt/celt_lpc_neon.h b/celt/celt_lpc_neon.h
new file mode 100644
index 0000000..72de8e0
--- /dev/null
+++ b/celt/celt_lpc_neon.h
@@ -0,0 +1,485 @@
+/* Copyright (c) 2013 Parrot */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_NEON_H
+#define CELT_LPC_NEON_H
+
+#ifdef FIXED_POINT
+
+#ifdef CELT_FIR_NEON
+#define OVERRIDE_CELT_FIR
+/* Optimized FIR filters for orders 1 and 4, the only orders the Opus
+ * encoder uses: the FIR calls in pitch.c are hard-coded to orders 1 and 4.
+ *
+ * TODO: test one-sample-at-a-time filtering
+ */
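+/* Both orders implement the scalar recurrence from celt_lpc.c:
+ *   y[i] = ROUND16(SHL32(EXTEND32(x[i]), SIG_SHIFT)
+ *                  + sum_j num[j]*x[i-j-1], SIG_SHIFT)
+ * where x[-1]..x[-ord] come from mem[] (mem[0] is the most recent sample) */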
+
+/* Order 1 NEON FIR filter implementation */
+static void celt_fir1(const opus_val16 *x, opus_val16 num, opus_val16 *y,
+ int N, opus_val16 mem)
+{
+ int i;
+
+ __asm__ __volatile__(
+ "vdup.s16 d8, %1;\n" //Duplicate num in d8 lane
+ "vdup.s16 q5, %4;\n" //Duplicate mem in q5 lane
+
+ /* We try to process 16 samples at a time */
+ "movs %5, %3, lsr #4;\n"
+ "beq .celt_fir1_process16_done_%=;\n"
+
+ ".celt_fir1_process16_%=:\n"
+ /* Load 16 x values in q0, q1 lanes */
+ "vld1.16 {q0-q1}, [%0]!;\n"
+
+ /* Init four 32-bit sums in q7, q8, q9, q10 lanes */
+ "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+ "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+ "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+ "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+ /* Make previous samples vector for MAC in q5, q6 lanes */
+ "vext.16 q5, q5, q0, #7;\n"
+ "vext.16 q6, q0, q1, #7;\n"
+
+ /* Filter 16 samples at a time */
+ "vmlal.s16 q7, d8, d10;\n"
+ "vmlal.s16 q8, d8, d11;\n"
+ "vmlal.s16 q9, d8, d12;\n"
+ "vmlal.s16 q10, d8, d13;\n"
+
+ /* Reduce filter sum to 16 bits for y output */
+ "vrshrn.s32 d4, q7, %[SIGSHIFT];\n"
+ "vrshrn.s32 d5, q8, %[SIGSHIFT];\n"
+ "vrshrn.s32 d6, q9, %[SIGSHIFT];\n"
+ "vrshrn.s32 d7, q10, %[SIGSHIFT];\n"
+
+ "pld [%0, #0];\n"
+
+ /* Duplicate the last x sample into q5: it becomes the "previous"
+ * sample vector for the next iteration */
+ "vdup.s16 q5, d3[3];\n"
+
+ /* Store 16 y results */
+ "vst1.16 {q2-q3}, [%2]!;\n"
+
+ "subs %5, %5, #1;\n"
+ "bne .celt_fir1_process16_%=;\n"
+ ".celt_fir1_process16_done_%=:\n"
+
+ /* Check if some samples remain */
+ "ands %5, %3, #15;\n"
+ "beq .celt_fir1_done_%=;\n"
+
+ /* Process remaining samples one by one with NEON.
+ * The previous sample is kept in the top lane of d11 in all cases,
+ * so we store the top lane of the vector result */
+ ".celt_fir1_process_remaining_%=:\n"
+ "vld1.16 d0[0], [%0]!;\n" //Load x
+ "vshll.s16 q7, d0, %[SIGSHIFT];\n" //Initialize sum
+ "vmlal.s16 q7, d8, d11;\n" //Multiply-accumulate
+ "vrshrn.s32 d4, q7, %[SIGSHIFT];\n" //Scale result
+ "vmov.s16 d11, d0;\n" //Move previous
+ "vst1.16 d4[3], [%2]!;\n" //Store result
+
+ "subs %5, %5, #1;\n"
+ "bne .celt_fir1_process_remaining_%=;\n"
+
+ ".celt_fir1_done_%=:\n"
+ : "=r"(x), "=r"(num), "=r"(y),
"=r"(N), "=r"(mem), "=r"(i)
+ : "0"(x), "1"(num), "2"(y),
"3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+ /* Clobber d0-d21 because some gcc version (4.4.3) don't aliase
q(x) to
+ * d(x), d(x+1) */
+ : "cc", "memory", "d0", "d1",
"d2", "d3", "d4", "d5", "d6",
"d7", "d8",
+ "d9", "d10", "d11", "d12",
"d13", "d14", "d15", "d16",
"d17", "d18",
+ "d19", "d20", "d21"
+ );
+}
+
+/* Order 4 FIR filter with NEON */
+static void celt_fir4(const opus_val16 *x, const opus_val16 *num, opus_val16 *y,
+ int N, opus_val16 *mem)
+{
+ int i;
+
+ __asm__ __volatile__(
+ "vld1.16 {d4}, [%1];\n" //Load num in d4 lane
+ "vld1.16 {d11}, [%4];\n" //Load provided mem in d11 lane
+
+ /* We try to process 16 samples at a time */
+ "movs %5, %3, lsr #4;\n"
+ "beq .celt_fir4_process16_done_%=;\n"
+
+ /* Reverse provided mem so the previous samples are in chronological order */
+ "vrev64.16 d11, d11;\n"
+
+ ".celt_fir4_process16_%=:\n"
+ /* Load 16 x values in q0, q1 lanes */
+ "vld1.16 {q0-q1}, [%0]!;\n"
+
+ /* Init four 32-bit sums in q7, q8, q9, q10 lanes */
+ "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+ "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+ "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+ "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+ /* Build the previous-sample vectors used by the filter:
+ * each sample needs the four previous samples.
+ * We use q registers to store them */
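+ /* After the two vext below, q5:q6 hold x[i-4] for the 16 output lanes;
+ * each following vext step slides this window by one, giving
+ * x[i-3], x[i-2] and finally x[i-1] */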
+ "vext.16 q5, q5, q0, #4;\n"
+ "vext.16 q6, q0, q1, #4;\n"
+
+ /* Filter 16 samples at a time, walking the coefficients in reverse
+ * order: start by multiply-accumulating the oldest samples with num[3] */
+ "vmlal.s16 q7, d10, d4[3];\n"
+ "vmlal.s16 q8, d11, d4[3];\n"
+ "vmlal.s16 q9, d12, d4[3];\n"
+ "vmlal.s16 q10, d13, d4[3];\n"
+
+ /* Prepare samples for n-3 sample processing */
+ "vext.16 q5, q5, q6, #1;\n"
+ "vext.16 d12, d12, d13, #1\n"
+ "vext.16 d22, d13, d3, #1\n" //We use d22 because we need to
access d3 by group
+
+ "vmlal.s16 q7, d10, d4[2];\n"
+ "vmlal.s16 q8, d11, d4[2];\n"
+ "vmlal.s16 q9, d12, d4[2];\n"
+ "vmlal.s16 q10, d22, d4[2];\n"
+
+ /* Prepare samples for n-2 sample processing */
+ "vext.16 q5, q5, q6, #1;\n"
+ "vext.16 d12, d12, d22, #1\n"
+ "vext.16 d22, d13, d3, #2\n"
+
+ "vmlal.s16 q7, d10, d4[1];\n"
+ "vmlal.s16 q8, d11, d4[1];\n"
+ "vmlal.s16 q9, d12, d4[1];\n"
+ "vmlal.s16 q10, d22, d4[1];\n"
+
+ /* Prepare samples for n-1 sample processing */
+ "vext.16 q5, q5, q6, #1;\n"
+ "vext.16 d12, d12, d22, #1\n"
+ "vext.16 d22, d13, d3, #3\n"
+
+ "vmlal.s16 q7, d10, d4[0];\n"
+ "vmlal.s16 q8, d11, d4[0];\n"
+ "vmlal.s16 q9, d12, d4[0];\n"
+ "vmlal.s16 q10, d22, d4[0];\n"
+
+ /* Reduce filter sum to 16 bits for y output */
+ "vrshrn.s32 d6, q7, %[SIGSHIFT];\n"
+ "vrshrn.s32 d7, q8, %[SIGSHIFT];\n"
+ "vrshrn.s32 d8, q9, %[SIGSHIFT];\n"
+ "vrshrn.s32 d9, q10, %[SIGSHIFT];\n"
+
+ "pld [%0, #0];\n"
+
+ /* Duplicate the last four x samples into d11: they become the
+ * "previous" sample vector for the next iteration */
+ "vmov.s16 d11, d3;\n"
+
+ /* Store 16 y results */
+ "vst1.16 {q3-q4}, [%2]!;\n"
+
+ "subs %5, %5, #1;\n"
+ "bne .celt_fir4_process16_%=;\n"
+ ".celt_fir4_process16_done_%=:\n"
+
+ /* Check if some samples remain */
+ "ands %5, %3, #15;\n"
+ "beq .celt_fir4_done_%=;\n"
+
+ /* Process remaining samples one by one with NEON.
+ * The previous samples are kept in d11 in all cases, so we reduce
+ * the four products of the vector multiply into a single sum */
+ ".celt_fir4_process_remaining_%=:\n"
+ "vld1.16 d0[0], [%0]!;\n" //Load x
+ "vshll.s16 q7, d0, %[SIGSHIFT];\n" //Initialize sum
+ "vmull.s16 q8, d4, d11;\n" //Multiply-accumulate
+ "vadd.s32 d16, d16, d17;\n" //Three next instructions
reduce the sum
+ "vpadd.s32 d16, d16;\n"
+ "vadd.s16 d14, d14, d16;\n"
+ "vrshrn.s32 d6, q7, %[SIGSHIFT];\n" //Scale result to 16 bits
+ "vmov.s16 d11, d0;\n" //Move previous
+ "vst1.16 d6[0], [%2]!;\n" //Store result
+
+ "subs %5, %5, #1;\n"
+ "bne .celt_fir4_process_remaining_%=;\n"
+
+ ".celt_fir4_done_%=:\n"
+ : "=r"(x), "=r"(num), "=r"(y),
"=r"(N), "=r"(mem), "=r"(i)
+ : "0"(x), "1"(num), "2"(y),
"3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+ /* Clobber d0-d21 because some gcc version (4.4.3) don't aliase
q(x) to
+ * d(x), d(x+1) */
+ : "cc", "memory", "d0", "d1",
"d2", "d3", "d4", "d5", "d6",
"d7", "d8",
+ "d9", "d10", "d11", "d12",
"d13", "d14", "d15", "d16",
"d17", "d18",
+ "d19", "d20", "d21", "d22"
+ );
+}
+
+void celt_fir(const opus_val16 *x, const opus_val16 *num, opus_val16 *y,
+ int N, int ord, opus_val16 *mem)
+{
+ int i,j;
+
+ switch(ord)
+ {
+ case 1:
+ celt_fir1(x, *num, y, N, *mem);
+ break;
+
+ case 4:
+ celt_fir4(x, num, y, N, mem);
+ break;
+
+ default:
+ for (i=0;i<N;i++)
+ {
+ opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+ for (j=0;j<ord;j++)
+ sum = MAC16_16(sum, num[j],mem[j]);
+ for (j=ord-1;j>=1;j--)
+ mem[j]=mem[j-1];
+ mem[0] = x[i];
+ y[i] = ROUND16(sum, SIG_SHIFT);
+ }
+ break;
+ }
+}
+#endif /* CELT_FIR_NEON */
+
+
+#ifdef CELT_AUTOCORR_NEON
+#define OVERRIDE_CELT_AUTOCORR
+void _celt_autocorr(
+ const opus_val16 *x, /* in: [0...n-1] samples x */
+ opus_val32 *ac, /* out: [0...lag-1] ac values */
+ const opus_val16 *window,
+ int overlap,
+ int lag,
+ int n
+ )
+{
+ opus_val32 d;
+ int i;
+ VARDECL(opus_val16, xx);
+ SAVE_STACK;
+ ALLOC(xx, n, opus_val16);
+ celt_assert(n>0);
+ celt_assert(overlap>=0);
+ for (i=0;i<n;i++)
+ xx[i] = x[i];
+
+ {
+ opus_val16 * xxbeg = xx;
+ opus_val16 * xxend = xx+n-1;
+ const opus_val16 * xbeg = x;
+ const opus_val16 * xend = x+n-1;
+ int scratch0, scratch1, scratch2, scratch3, scratch4;
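+ /* NEON version of the scalar windowing loop from celt_lpc.c, applied
+ * to both ends of the buffer at once:
+ * for (i=0;i<overlap;i++) {
+ * xx[i] = MULT16_16_Q15(x[i], window[i]);
+ * xx[n-i-1] = MULT16_16_Q15(x[n-i-1], window[i]);
+ * } */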
+ __asm__ __volatile__(
+ "movs %6, %5, lsr #3;\n"
+ "beq .celt_autocorr_process8_done_%=;\n"
+
+ /* Process 8 samples at a time */
+ ".celt_autocorr_process8_%=:\n"
+ "subs %3, %3, #16;\n"
+ "subs %1, %1, #16;\n"
+ "vld1.16 {q2}, [%4]!;\n" //Load 8 window values
+ "vld1.16 {q0}, [%2]!;\n" //Load 8 x values from beg
+ "vld1.16 {q1}, [%3];\n" //Load 8 x values from end
+
+ /* MULT16_16_Q15(x[i],window[i]) */
+ "vmull.s16 q3, d0, d4;\n"
+ "vmull.s16 q4, d1, d5;\n"
+
+ "pld [%4, #0];\n"
+
+ /* MULT16_16_Q15(x[n-i-1],window[i]) */
+ "vmull.s16 q5, d2, d4;\n"
+ "vmull.s16 q6, d3, d5;\n"
+
+ "pld [%2, #0];\n"
+
+ /* Shift right by 15 */
+ "vshrn.s32 d0, q3, #15;\n"
+ "vshrn.s32 d1, q4, #15;\n"
+ "vshrn.s32 d2, q5, #15;\n"
+ "vshrn.s32 d3, q6, #15;\n"
+
+ "pld [%3, #-16];\n"
+
+ "vst1.16 {q0}, [%0]!;\n"
+ "vst1.16 {q1}, [%1];\n"
+
+ "subs %6, %6, #1;\n"
+ "bne .celt_autocorr_process8_%=;\n"
+ ".celt_autocorr_process8_done_%=:\n"
+
+ "ands %6, %5, #7;\n"
+ "beq .celt_autocorr_done_%=;\n"
+
+ /* Process remaining samples one by one */
+ ".celt_autocorr_process_remaining_%=:\n"
+ "subs %3, %3, #2;\n"
+ "subs %1, %1, #2;\n"
+ "vld1.16 d4[0], [%4]!;\n" //Load 1 window value
+ "vld1.16 d0[0], [%2]!;\n" //Load 1 x value from beg
+ "vld1.16 d0[1], [%3];\n" //Load 1 x value from end
+
+ "vmull.s16 q3, d0, d4[0];\n"
+ "vshrn.s32 d0, q3, #15;\n"
+
+ "vst1.16 d0[0], [%0]!;\n"
+ "vst1.16 d0[1], [%1];\n"
+
+ "subs %6, %6, #1;\n"
+ "bne .celt_autocorr_process_remaining_%=;\n"
+ ".celt_autocorr_done_%=:\n"
+ : "=r"(scratch0), "=r"(scratch1),
"=r"(scratch2), "=r"(scratch3),
+ "=r"(scratch4), "=r"(overlap), "=r"(i)
+ : "0"(xxbeg), "1"(xxend), "2"(xbeg),
"3"(xend), "4"(window), "5"(overlap)
+ /* Clobber d0-d13 because some gcc version (4.4.3) don't aliase
q(x) to
+ * d(x), d(x+1) */
+ : "cc", "memory", "d0", "d1",
"d2", "d3", "d4", "d5", "d6",
"d7", "d8",
+ "d9", "d10", "d11", "d12",
"d13"
+ );
+ }
+
+
+ {
+ opus_val32 ac0;
+ int shift;
+ int scratch1;
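+ /* NEON version of the scalar energy sum:
+ * for (i=0;i<n;i++) ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
+ * four 32-bit partial sums are kept in q0 and reduced at the end */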
+ __asm__ __volatile__(
+ "veor.s32 q0, q0, q0;\n"
+ "movs %3, %2, lsr #3;\n"
+ "beq .celt_autocorr_process8_done_%=;\n"
+
+ /* Process 8 samples at a time */
+ ".celt_autocorr_process8_%=:\n"
+ "vld1.16 {q1}, [%1]!\n" //Load 8 xx values
+ "subs %3, %3, #1;\n"
+ "vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i]
+ "vmull.s16 q3, d3, d3;\n" //MULT16_16(xx[i], xx[i]
+ "pld [%1, #0];\n"
+ "vsra.s32 q0, q2, #9;\n" //Shift right by 9 and accumulate to
ac0
+ "vsra.s32 q0, q3, #9;\n" //Shift right by 9 and accumulate to
ac0
+ "bne .celt_autocorr_process8_%=;\n"
+
+ ".celt_autocorr_process8_done_%=:\n"
+ "ands %3, %2, #7;\n"
+ "beq .celt_autocorr_process_remaining_done_%=;\n"
+
+ /* Process remaining samples */
+ "veor.s16 q1, q1, q1;\n" //Clear q1 to not accumulate bad values
+ ".celt_autocorr_process_remaining_%=:\n"
+ "vld1.16 d2[0], [%1]!;\n" //Load 1 xx values
+ "subs %3, %3, #1;\n"
+ "vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i]
+ "vsra.s32 q0, q2, #9;\n" //Shift right by 9 and accumulate to
ac0
+ "bne .celt_autocorr_process_remaining_%=;\n"
+ ".celt_autocorr_process_remaining_done_%=:\n"
+
+ /* Reduce sum and move result to ARM register */
+ "vadd.s32 d0, d0, d1;\n"
+ "vpadd.s32 d0, d0;\n"
+ "vmov.s32 %0, d0[0];\n"
+ : "=r"(ac0), "=r"(scratch1), "=r"(n),
"=r"(i)
+ : "1"(xx), "2"(n)
+ /* Clobber d0-d7 because some gcc version (4.4.3) don't aliase q(x)
to
+ * d(x), d(x+1) */
+ : "cc", "d0", "d1", "d2",
"d3", "d4", "d5", "d6", "d7"
+ );
+ ac0 += 1+n;
+
+ shift = celt_ilog2(ac0)-30+10;
+ shift = (shift+1)/2;
+ for(i=0;i<n;i++)
+ xx[i] = VSHR32(xx[i], shift);
+ }
+
+ while (lag>=0)
+ {
+ opus_val16 * xx1 = xx+lag;
+ opus_val16 * xx2 = xx;
+ int scratch4, scratch5;
+
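+ /* NEON version of the scalar correlation loop:
+ * for (i=lag, d=0; i<n; i++) d = MAC16_16(d, xx[i], xx[i-lag]);
+ * ac[lag] = d;
+ * four partial sums are kept in q0 and reduced before the store */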
+ __asm__ __volatile__(
+ "veor.s32 q0, q0;\n" //Clear sum, q0 will contain 4 sum
+ "movs %3, %4, lsr #4;\n" //(n-lag)/16
+ "beq .celt_autocorr_process16_done_%=;\n"
+
+ /* Process 16 samples at a time */
+ ".celt_autocorr_process16_%=:\n"
+ "vld1.16 {q1-q2}, [%1]!;\n" //Load 16 xx values from xx+lag=xx[i]
+ "vld1.16 {q3-q4}, [%2]!;\n" //Load 16 xx values from xx=xx[i-lag]
+ "vmlal.s16 q0, d2, d6;\n" //MAC16_16(d, xx[i], xx[i-lag])
+ "vmlal.s16 q0, d3, d7;\n" //MAC16_16(d, xx[i], xx[i-lag]);
+ "pld [%1, #0];\n"
+ "pld [%2, #0];\n"
+ "vmlal.s16 q0, d4, d8;\n" //MAC16_16(d, xx[i], xx[i-lag]);
+ "vmlal.s16 q0, d5, d9;\n" //MAC16_16(d, xx[i], xx[i-lag]);
+ "pld [%1, #16];\n"
+ "pld [%2, #16];\n"
+ "subs %3, %3, #1;\n"
+ "bne .celt_autocorr_process16_%=;\n"
+ ".celt_autocorr_process16_done_%=:\n"
+
+ "ands %3, %4, #15;\n" //(n-lag)&15
+ "beq .celt_autocorr_process_remaining_done_%=;\n"
+ "veor.s32 q1, q1, q1;\n" //Clear q1
+
+ /* Process remaining samples one by one */
+ ".celt_autocorr_process_remaining_%=:\n"
+ "vld1.16 d2[0], [%1]!;\n" //Load 1 xx value from xx+lag=xx[i]
+ "vld1.16 d3[0], [%2]!;\n" //Load 1 xx value from xx=xx[i-lag]
+ "subs %3, %3, #1;\n"
+ "vmlal.s16 q0, d2, d3;\n" //MAC16_16(d, xx[i], xx[i-lag])
+ "bne .celt_autocorr_process_remaining_%=;\n"
+ ".celt_autocorr_process_remaining_done_%=:\n"
+
+ /* Reduce sum and store it */
+ "vadd.s32 d0, d0, d1;\n"
+ "vpadd.s32 d0, d0;\n"
+ "vst1.32 d0[0], [%5];\n"
+ : "=&r"(d), "=r"(xx1), "=r"(xx2),
"=r"(i), "=r"(scratch4), "=r"(scratch5)
+ : "0"(0), "1"(xx1), "2"(xx2),
"4"(n-lag), "5"(ac+lag)
+ /* Clobber d0-d9 because some gcc version (4.4.3) don't aliase q(x)
to
+ * d(x), d(x+1) */
+ : "cc", "memory", "d0", "d1",
"d2", "d3", "d4", "d5", "d6",
"d7", "d8",
+ "d9"
+ );
+ lag--;
+ }
+ ac[0] += 10;
+
+ RESTORE_STACK;
+}
+#endif /* CELT_AUTOCORR_NEON */
+
+#endif /* FIXED_POINT */
+
+#endif