Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
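These call sites should call celt_inner_prod() instead, so that the arch-specific
implementation can be selected. compute_band_energies() gains an "arch"
parameter for this purpose.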
---
celt/bands.c | 7 ++++---
celt/bands.h | 2 +-
celt/celt_encoder.c | 6 +++---
celt/pitch.c | 2 +-
src/opus_multistream_encoder.c | 2 +-
5 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/celt/bands.c b/celt/bands.c
index bbe8a4c..1ab24aa 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos)
#ifdef FIXED_POINT
/* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch)
{
int i, c, N;
const opus_int16 *eBands = m->eBands;
+ (void)arch;
N = m->shortMdctSize<<LM;
c=0; do {
for (i=0;i<end;i++)
@@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig *
OPUS_RESTRICT freq, cel
#else /* FIXED_POINT */
/* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch)
{
int i, c, N;
const opus_int16 *eBands = m->eBands;
@@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig
*X, celt_ener *band
for (i=0;i<end;i++)
{
opus_val32 sum;
- sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)],
&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
+ sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)],
&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch);
bandE[i+c*m->nbEBands] = celt_sqrt(sum);
/*printf ("%f ", bandE[i+c*m->nbEBands]);*/
}
diff --git a/celt/bands.h b/celt/bands.h
index c040c7f..61ae0cd 100644
--- a/celt/bands.h
+++ b/celt/bands.h
@@ -41,7 +41,7 @@
* @param X Spectrum
* @param bandE Square root of the energy for each band (returned)
*/
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM);
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener
*bandE, int end, int C, int LM, int arch);
/*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const
opus_val16 *tonality, celt_ener *bandE);*/
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 5be7610..8af61d5 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
if (secondMdct)
{
compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample,
st->arch);
- compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+ compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
for (i=0;i<C*nbEBands;i++)
bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
@@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
st->arch);
if (CC==2&&C==1)
tf_chan = 0;
- compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+ compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
if (st->lfe)
{
@@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st,
const opus_val16 * pcm,
isTransient = 1;
shortBlocks = M;
compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
st->arch);
- compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+ compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
/* Compensate for the scaling of short vs long mdcts */
for (i=0;i<C*nbEBands;i++)
diff --git a/celt/pitch.c b/celt/pitch.c
index bf46e7d..f944a33 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp,
opus_val16 * OPUS_RESTR
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
- sum = celt_inner_prod_c(x_lp, y+i, len>>1);
+ sum = celt_inner_prod(x_lp, y+i, len>>1, arch);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index c07132f..6ecea5d 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode, const void
*pcm, opus_val16 *b
freq[i] = 0;
}
- compute_band_energies(celt_mode, freq, bandE, 21, 1, LM);
+ compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch);
amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1);
/* Apply spreading function with -6 dB/band going up and -12 dB/band
going down. */
for (i=1;i<21;i++)
--
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 13/15] Replace call of celt_inner_prod_c() (step 2)
Should call celt_inner_prod() instead.
This requires an API change: celt_pitch_xcorr_c() and the arch-specific
celt_pitch_xcorr implementations now take an "arch" argument.
---
celt/arm/arm_celt_map.c | 4 ++--
celt/arm/celt_neon_intr.c | 3 ++-
celt/arm/pitch_arm.h | 24 +++++++++---------------
celt/pitch.c | 13 -------------
celt/pitch.h | 11 ++---------
5 files changed, 15 insertions(+), 40 deletions(-)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 1d53d62..6e28c70 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -51,7 +51,7 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16
*,
(defined(OPUS_ARM_MAY_HAVE_MEDIA) &&
!defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) &&
!defined(OPUS_ARM_PRESUME_EDSP)))
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
- const opus_val16 *, opus_val32 *, int , int) = {
+ const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
@@ -62,7 +62,7 @@ opus_val32 (*const
CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
# else /* !FIXED_POINT */
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
- const opus_val16 *, opus_val32 *, int, int) = {
+ const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index 47bbe3d..f8f1d43 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -290,8 +290,9 @@ static void xcorr_kernel_neon_float_process1(const float32_t
*x,
}
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch) {
+ opus_val32 *xcorr, int len, int max_pitch, int arch) {
int i;
+ (void)arch;
celt_assert(max_pitch > 0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index 1433116..d8b022e 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -34,7 +34,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON)
opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch);
+ opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif
# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
@@ -43,7 +43,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const
opus_val16 *_y,
# if defined(OPUS_ARM_MAY_HAVE_EDSP)
opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch);
+ opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif
# if defined(OPUS_HAVE_RTCD) && \
@@ -52,18 +52,15 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const
opus_val16 *_y,
(defined(OPUS_ARM_MAY_HAVE_EDSP) &&
!defined(OPUS_ARM_PRESUME_EDSP)))
extern opus_val32
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
- const opus_val16 *, opus_val32 *, int, int);
+ const opus_val16 *, opus_val32 *, int, int, int);
# define OVERRIDE_PITCH_XCORR (1)
-# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
- ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
- xcorr, len, max_pitch))
+# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
# elif defined(OPUS_ARM_PRESUME_EDSP) || \
defined(OPUS_ARM_PRESUME_MEDIA) || \
defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_PITCH_XCORR (1)
-# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
- ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+# define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
# endif
@@ -99,25 +96,22 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
/* Float case */
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch);
+ opus_val32 *xcorr, int len, int max_pitch, int
arch);
#endif
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
- const opus_val16 *, opus_val32 *, int, int);
+ const opus_val16 *, opus_val32 *, int, int, int);
# define OVERRIDE_PITCH_XCORR (1)
-# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
- ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
- xcorr, len, max_pitch))
+# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_PITCH_XCORR (1)
-# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
- ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+# define celt_pitch_xcorr celt_pitch_xcorr_float_neon
# endif
diff --git a/celt/pitch.c b/celt/pitch.c
index f944a33..874929e 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -220,13 +220,8 @@ opus_val32
#else
void
#endif
-#if defined(OVERRIDE_PITCH_XCORR)
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch)
-#else
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch)
-#endif
{
#if 0 /* This is a simple version of the pitch correlation that should work
@@ -265,11 +260,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16
*_y,
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
-#if defined(OVERRIDE_PITCH_XCORR)
- xcorr_kernel_c(_x, _y+i, sum, len);
-#else
xcorr_kernel(_x, _y+i, sum, len, arch);
-#endif
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
@@ -285,11 +276,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16
*_y,
for (;i<max_pitch;i++)
{
opus_val32 sum;
-#if defined(OVERRIDE_PITCH_XCORR)
- sum = celt_inner_prod_c(_x, _y+i, len);
-#else
sum = celt_inner_prod(_x, _y+i, len, arch);
-#endif
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
diff --git a/celt/pitch.h b/celt/pitch.h
index d350353..d797844 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -184,17 +184,10 @@ opus_val32
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch);
-
-#if !defined(OVERRIDE_PITCH_XCORR)
-#ifdef FIXED_POINT
-opus_val32
-#else
-void
-#endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch);
+#ifndef OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_c
#endif
#endif
--
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 14/15] Optimize celt_inner_prod() and dual_inner_prod() for ARM NEON
Created a corresponding unit test.
The fixed-point optimizations are bit exact with the C functions.
The floating-point optimizations are not bit exact with the C functions, because
the order of the floating-point operations is changed. But they are bit exact
with the simulation C functions, which simulate the floating-point operations
performed in the optimizations.
---
celt/arm/arm_celt_map.c | 17 ++
celt/arm/pitch_arm.h | 36 ++++
celt/arm/pitch_neon_intr.c | 179 ++++++++++++++++++++
celt/pitch.h | 3 +-
celt/tests/test_unit_dft.c | 1 +
celt/tests/test_unit_mathops.c | 1 +
celt/tests/test_unit_mdct.c | 1 +
celt/tests/test_unit_optimization_pitch.c | 263 ++++++++++++++++++++++++++++++
celt/tests/test_unit_rotation.c | 1 +
celt/x86/pitch_sse.h | 5 +-
celt_sources.mk | 3 +-
tests/test_unit_optimization.c | 4 +
12 files changed, 508 insertions(+), 6 deletions(-)
create mode 100644 celt/arm/pitch_neon_intr.c
create mode 100644 celt/tests/test_unit_optimization_pitch.c
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 6e28c70..a1a553a 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -36,6 +36,23 @@
#if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
const opus_val16 *y, int N) = {
+ celt_inner_prod_c, /* ARMv4 */
+ celt_inner_prod_c, /* EDSP */
+ celt_inner_prod_c, /* Media */
+ MAY_HAVE_NEON(celt_inner_prod) /* NEON */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const
opus_val16 *y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2) = {
+ dual_inner_prod_c, /* ARMv4 */
+ dual_inner_prod_c, /* EDSP */
+ dual_inner_prod_c, /* Media */
+ MAY_HAVE_NEON(dual_inner_prod) /* NEON */
+};
+# endif
+
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index d8b022e..d1a8db0 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -30,6 +30,42 @@
# include "armcpu.h"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int
N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+ const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch) ((void)(arch),
PRESUME_NEON(celt_inner_prod)(x, y, N))
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),
PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+# endif
+
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
&& !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const
opus_val16 *x, const opus_val16 *y, int N);
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch)
((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch) ((void)(arch),
celt_inner_prod_neon(x, y, N))
+# endif
+# endif
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
&& !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+ const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1,
opus_val32 *xy2);
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)
((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),
dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+# endif
+# endif
+
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON)
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000..2bda6e1
--- /dev/null
+++ b/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file pitch_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for celt pitch functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int
N)
+{
+ int i;
+ opus_val32 xy;
+
+#ifdef FIXED_POINT
+ int16x8_t x_s16x8, y_s16x8;
+ int32x4_t xy_s32x4 = vdupq_n_s32(0);
+ int64x2_t xy_s64x2;
+ int64x1_t xy_s64x1;
+
+ for (i = 0; i < N - 7; i += 8) {
+ x_s16x8 = vld1q_s16(&x[i]);
+ y_s16x8 = vld1q_s16(&y[i]);
+ xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y_s16x8));
+ xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y_s16x8));
+ }
+
+ if (N - i >= 4) {
+ const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+ const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+ xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+ i += 4;
+ }
+
+ xy_s64x2 = vpaddlq_s32(xy_s32x4);
+ xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+ xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+#else
+ float32x4_t xy_f32x4 = vdupq_n_f32(0);
+ float32x2_t xy_f32x2;
+
+ for (i = 0; i < N - 7; i += 8) {
+ float32x4_t x_f32x4, y_f32x4;
+ x_f32x4 = vld1q_f32(&x[i]);
+ y_f32x4 = vld1q_f32(&y[i]);
+ xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+ x_f32x4 = vld1q_f32(&x[i + 4]);
+ y_f32x4 = vld1q_f32(&y[i + 4]);
+ xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+ }
+
+ if (N - i >= 4) {
+ const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+ const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+ xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+ i += 4;
+ }
+
+ xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+ xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+ xy = vget_lane_f32(xy_f32x2, 0);
+#endif
+
+ for (; i < N; i++) {
+ xy = MAC16_16(xy, x[i], y[i]);
+ }
+ return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const
opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+ int i;
+ opus_val32 xy01, xy02;
+
+#ifdef FIXED_POINT
+ int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+ int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+ int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+ int64x2_t xy01_s64x2, xy02_s64x2;
+ int64x1_t xy01_s64x1, xy02_s64x1;
+
+ for (i = 0; i < N - 7; i += 8) {
+ x_s16x8 = vld1q_s16(&x[i]);
+ y01_s16x8 = vld1q_s16(&y01[i]);
+ y02_s16x8 = vld1q_s16(&y02[i]);
+ xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y01_s16x8));
+ xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16
(y02_s16x8));
+ xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y01_s16x8));
+ xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8),
vget_high_s16(y02_s16x8));
+ }
+
+ if (N - i >= 4) {
+ const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+ const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+ const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+ xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+ xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+ i += 4;
+ }
+
+ xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+ xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+ xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+ xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+ xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+ xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+#else
+ float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+ float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+ float32x2_t xy01_f32x2, xy02_f32x2;
+
+ for (i = 0; i < N - 7; i += 8) {
+ float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+ x_f32x4 = vld1q_f32(&x[i]);
+ y01_f32x4 = vld1q_f32(&y01[i]);
+ y02_f32x4 = vld1q_f32(&y02[i]);
+ xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+ xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+ x_f32x4 = vld1q_f32(&x[i + 4]);
+ y01_f32x4 = vld1q_f32(&y01[i + 4]);
+ y02_f32x4 = vld1q_f32(&y02[i + 4]);
+ xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+ xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+ }
+
+ if (N - i >= 4) {
+ const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+ const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+ const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+ xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+ xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+ i += 4;
+ }
+
+ xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+ xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+ xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+ xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+ xy01 = vget_lane_f32(xy01_f32x2, 0);
+ xy02 = vget_lane_f32(xy02_f32x2, 0);
+#endif
+
+ for (; i < N; i++) {
+ xy01 = MAC16_16(xy01, x[i], y01[i]);
+ xy02 = MAC16_16(xy02, x[i], y02[i]);
+ }
+ *xy1 = xy01;
+ *xy2 = xy02;
+}
diff --git a/celt/pitch.h b/celt/pitch.h
index d797844..e425f56 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,8 +46,7 @@
#include "mips/pitch_mipsr1.h"
#endif
-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
- || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 582618e..02904bf 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -54,6 +54,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "mdct.c"
# include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index f5af994..524c1f8 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -69,6 +69,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
# include "mdct.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 0658c7a..3b28767 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -55,6 +55,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "arm/celt_ne10_fft.c"
# include "arm/celt_ne10_mdct.c"
diff --git a/celt/tests/test_unit_optimization_pitch.c
b/celt/tests/test_unit_optimization_pitch.c
new file mode 100644
index 0000000..64bb2a9
--- /dev/null
+++ b/celt/tests/test_unit_optimization_pitch.c
@@ -0,0 +1,263 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "modes.h"
+#include "pitch.h"
+
+#define MAX_LEN_INNER_PROD 960
+
+#ifndef UNIT_TEST_CELT_INNER_PROD
+#define UNIT_TEST_CELT_INNER_PROD
+
+static inline float rand_float(float min, float max)
+{
+ return ((max - min) * ((float)rand() / RAND_MAX)) + min;
+}
+
+static OPUS_INLINE opus_val16 rand_val16(opus_val16 min, opus_val16 max)
+{
+#ifdef FIXED_POINT
+ (void)min;
+ (void)max;
+ return rand();
+#else
+ return rand_float(min, max);
+#endif
+}
+
+static OPUS_INLINE void init_val16_buffer(opus_val16* buffer, int num)
+{
+ const opus_val16 min = (opus_val16)-1e10;
+ const opus_val16 max = (opus_val16) 1e10;
+
+ for (int i = 0; i < num; i++) {
+ buffer[i] = rand_val16(min, max);
+ }
+}
+
+#endif
+
+/* ==========================================================================
*/
+/* This part of code simulates floating-point operations. */
+
+#ifndef FIXED_POINT
+
+/* celt_inner_prod_float_simulation_sse() simulates the floating-point
+ * operations of celt_inner_prod_sse(), and both functions should have
+ * bit-exact output. */
+opus_val32 celt_inner_prod_float_simulation_sse(const opus_val16 *x,
+ const opus_val16 *y, int N)
+{
+ int i;
+ opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+ for (i = 0; i < N - 3; i += 4) {
+ xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+ xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+ xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+ xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+ }
+ xy0 += xy2;
+ xy1 += xy3;
+ xy = xy0 + xy1;
+ for (; i < N; i++) {
+ xy = MAC16_16(xy, x[i], y[i]);
+ }
+ return xy;
+}
+
+/* dual_inner_prod_float_simulation_sse() simulates the floating-point
+ * operations of dual_inner_prod_sse(), and both functions should have
+ * bit-exact output.
+ */
+void dual_inner_prod_float_simulation_sse(const opus_val16 *x, const opus_val16
*y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+ int i;
+ opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0,
xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+ for (i = 0; i < N - 3; i += 4) {
+ xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+ xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+ xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+ xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+ xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+ xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+ xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+ xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+ }
+ xy01_0 += xy01_2;
+ xy02_0 += xy02_2;
+ xy01_1 += xy01_3;
+ xy02_1 += xy02_3;
+ xy01 = xy01_0 + xy01_1;
+ xy02 = xy02_0 + xy02_1;
+ for (; i < N; i++) {
+ xy01 = MAC16_16(xy01, x[i], y01[i]);
+ xy02 = MAC16_16(xy02, x[i], y02[i]);
+ }
+ *xy1 = xy01;
+ *xy2 = xy02;
+}
+
+# define celt_inner_prod_float_simulation_c celt_inner_prod_c
+# define dual_inner_prod_float_simulation_c dual_inner_prod_c
+
+/* Reuse the SSE simulations, since the NEON optimizations happen to have the
+ * same simulated floating-point operations as the SSE optimizations. */
+# define celt_inner_prod_float_simulation_neon
celt_inner_prod_float_simulation_sse
+# define dual_inner_prod_float_simulation_neon
dual_inner_prod_float_simulation_sse
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+# ifdef OPUS_X86_PRESUME_SSE
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),
celt_inner_prod_float_simulation_sse(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((void)(arch), dual_inner_prod_float_simulation_sse(x, y01, y02, N, xy1, xy2))
+# else
+# define celt_inner_prod_float_simulation(x, y, N, arch)
((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01,
y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK +
1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+ celt_inner_prod_float_simulation_c, /* non-sse */
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation)
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const
opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32
*xy1, opus_val32 *xy2) = {
+ dual_inner_prod_float_simulation_c, /* non-sse */
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation)
+};
+# endif /* !defined(OPUS_X86_PRESUME_SSE) */
+# endif /* OPUS_X86_MAY_HAVE_SSE */
+
+# ifdef OPUS_ARM_MAY_HAVE_NEON_INTR
+# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+# ifndef OPUS_HAVE_RTCD
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),
PRESUME_NEON(celt_inner_prod_float_simulation)(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((void)(arch), PRESUME_NEON(dual_inner_prod_float_simulation)(x, y01, y02, N,
xy1, xy2))
+# else
+# ifdef OPUS_ARM_PRESUME_NEON_INTR
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),
celt_inner_prod_float_simulation_neon(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((void)(arch), dual_inner_prod_float_simulation_neon(x, y01, y02, N, xy1, xy2))
+# else
+# define celt_inner_prod_float_simulation(x, y, N, arch)
((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01,
y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK +
1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+ celt_inner_prod_float_simulation_c, /* ARMv4 */
+ celt_inner_prod_float_simulation_c, /* EDSP */
+ celt_inner_prod_float_simulation_c, /* Media */
+ MAY_HAVE_NEON(celt_inner_prod_float_simulation) /* NEON */
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const
opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32
*xy1, opus_val32 *xy2) = {
+ dual_inner_prod_float_simulation_c, /* ARMv4 */
+ dual_inner_prod_float_simulation_c, /* EDSP */
+ dual_inner_prod_float_simulation_c, /* Media */
+ MAY_HAVE_NEON(dual_inner_prod_float_simulation) /* NEON */
+};
+# endif /* !defined(OPUS_ARM_PRESUME_NEON_INTR) */
+# endif /* OPUS_HAVE_RTCD */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# ifndef OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION
+# define celt_inner_prod_float_simulation(x, y, N, arch)
((void)(arch),celt_inner_prod_float_simulation_c(x, y, N))
+# endif
+
+# ifndef OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch)
((void)(arch),dual_inner_prod_float_simulation_c(x, y01, y02, N, xy1, xy2))
+# endif
+
+#endif /* !FIXED_POINT */
+
+/* ==========================================================================
*/
+
+static int test_celt_inner_prod(int arch)
+{
+ opus_val16 x[MAX_LEN_INNER_PROD], y[MAX_LEN_INNER_PROD];
+ opus_val32 xy_org, xy_opt;
+ int N;
+
+ printf("%44s() ...", __func__);
+ init_val16_buffer(x, MAX_LEN_INNER_PROD);
+ init_val16_buffer(y, MAX_LEN_INNER_PROD);
+ for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+ xy_org = celt_inner_prod_c(x, y, N);
+#else
+ xy_org = celt_inner_prod_float_simulation(x, y, N, arch);
+#endif
+ xy_opt = celt_inner_prod(x, y, N, arch);
+ if (xy_org != xy_opt) {
+#ifdef FIXED_POINT
+ printf("\nN=%d xy_org = %d, xy_opt = %d failed!", N,
xy_org, xy_opt);
+#else
+ printf("\nN=%d xy_org = %f, xy_opt = %f failed!", N,
xy_org, xy_opt);
+#endif
+ return -1;
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+
+static int test_dual_inner_prod(int arch)
+{
+ opus_val16 x[MAX_LEN_INNER_PROD], y01[MAX_LEN_INNER_PROD],
y02[MAX_LEN_INNER_PROD];
+ opus_val32 xy1_org, xy1_opt, xy2_org, xy2_opt;
+ int N;
+
+ printf("%44s() ...", __func__);
+ init_val16_buffer(x, MAX_LEN_INNER_PROD);
+ init_val16_buffer(y01, MAX_LEN_INNER_PROD);
+ init_val16_buffer(y02, MAX_LEN_INNER_PROD);
+ for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+ dual_inner_prod_c(x, y01, y02, N, &xy1_org, &xy2_org);
+#else
+ dual_inner_prod_float_simulation(x, y01, y02, N, &xy1_org,
&xy2_org, arch);
+#endif
+ dual_inner_prod(x, y01, y02, N, &xy1_opt, &xy2_opt, arch);
+ if ((xy1_org != xy1_opt) || (xy2_org != xy2_opt)) {
+#ifdef FIXED_POINT
+ printf("\nN=%d xy1_org = %d, xy1_opt = %d failed!", N,
xy1_org, xy1_opt);
+ printf("\nN=%d xy2_org = %d, xy2_opt = %d failed!", N,
xy2_org, xy2_opt);
+#else
+ printf("\nN=%d xy1_org = %f, xy1_opt = %f failed!", N,
xy1_org, xy1_opt);
+ printf("\nN=%d xy2_org = %f, xy2_opt = %f failed!", N,
xy2_org, xy2_opt);
+#endif
+ return -1;
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 785b40d..0b73839 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -67,6 +67,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
# include "mdct.c"
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index e5f87ab..5e85599 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2(
int N);
#endif
-#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse(
const opus_val16 *x,
const opus_val16 *y,
@@ -104,7 +104,7 @@ opus_val32 celt_inner_prod_sse(
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse4_1(x, y, N))
-#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT)
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse2(x, y, N))
@@ -114,7 +114,6 @@ opus_val32 celt_inner_prod_sse(
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse(x, y, N))
-
#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
(defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
diff --git a/celt_sources.mk b/celt_sources.mk
index c4bd285..dc107d9 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -38,7 +38,8 @@ celt/arm/armopts.s.in
CELT_SOURCES_ARM_NEON_INTR = \
celt/arm/celt_lpc_neon_intr.c \
-celt/arm/celt_neon_intr.c
+celt/arm/celt_neon_intr.c \
+celt/arm/pitch_neon_intr.c
CELT_SOURCES_ARM_NE10= \
celt/arm/celt_ne10_fft.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 6155dfb..a88ac21 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -71,6 +71,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "celt/arm/celt_lpc_neon_intr.c"
# include "celt/arm/celt_neon_intr.c"
+# include "celt/arm/pitch_neon_intr.c"
# include "silk/arm/biquad_alt_neon_intr.c"
# include "silk/arm/inner_prod_aligned_neon_intr.c"
# include "silk/arm/LPC_analysis_filter_neon_intr.c"
@@ -94,6 +95,7 @@
#endif
+# include "celt/tests/test_unit_optimization_pitch.c"
# include "silk/tests/test_unit_optimization_biquad_alt.c"
# include "silk/tests/test_unit_optimization_inner_prod_aligned.c"
# include "silk/tests/test_unit_optimization_LPC_analysis_filter.c"
@@ -118,6 +120,8 @@ int main(void)
result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
result |= test_warped_autocorrelation(arch);
#endif /* FIXED_POINT */
+ result |= test_celt_inner_prod(arch);
+ result |= test_dual_inner_prod(arch);
result |= test_silk_biquad_alt(arch);
result |= test_silk_inner_prod_aligned_scale(arch);
result |= test_silk_LPC_analysis_filter(arch);
--
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 15/15] Clean celt_pitch_xcorr_float_neon()
Call celt_inner_prod_neon() and remove redundant code.
---
celt/arm/celt_neon_intr.c | 105 +---------------------------------------------
1 file changed, 2 insertions(+), 103 deletions(-)
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index f8f1d43..cf44398 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -191,104 +191,6 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
vst1q_f32(sum, SUMM);
}
-/*
- * Function: xcorr_kernel_neon_float_process1
- * ---------------------------------
- * Computes single correlation values and stores in *sum
- */
-static void xcorr_kernel_neon_float_process1(const float32_t *x,
- const float32_t *y, float32_t *sum, int len) {
- float32x4_t XX[4];
- float32x4_t YY[4];
- float32x2_t XX_2;
- float32x2_t YY_2;
- float32x4_t SUMM;
- float32x2_t SUMM_2[2];
- const float32_t *xi = x;
- const float32_t *yi = y;
-
- SUMM = vdupq_n_f32(0);
-
- /* Work on 16 values per iteration */
- while (len >= 16) {
- XX[0] = vld1q_f32(xi);
- xi += 4;
- XX[1] = vld1q_f32(xi);
- xi += 4;
- XX[2] = vld1q_f32(xi);
- xi += 4;
- XX[3] = vld1q_f32(xi);
- xi += 4;
-
- YY[0] = vld1q_f32(yi);
- yi += 4;
- YY[1] = vld1q_f32(yi);
- yi += 4;
- YY[2] = vld1q_f32(yi);
- yi += 4;
- YY[3] = vld1q_f32(yi);
- yi += 4;
-
- SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
- SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
- SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
- SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
- len -= 16;
- }
-
- /* Work on 8 values */
- if (len >= 8) {
- XX[0] = vld1q_f32(xi);
- xi += 4;
- XX[1] = vld1q_f32(xi);
- xi += 4;
-
- YY[0] = vld1q_f32(yi);
- yi += 4;
- YY[1] = vld1q_f32(yi);
- yi += 4;
-
- SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
- SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
- len -= 8;
- }
-
- /* Work on 4 values */
- if (len >= 4) {
- XX[0] = vld1q_f32(xi);
- xi += 4;
- YY[0] = vld1q_f32(yi);
- yi += 4;
- SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
- len -= 4;
- }
-
- /* Start accumulating results */
- SUMM_2[0] = vget_low_f32(SUMM);
- if (len >= 2) {
- /* While at it, consume 2 more values if available */
- XX_2 = vld1_f32(xi);
- xi += 2;
- YY_2 = vld1_f32(yi);
- yi += 2;
- SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
- len -= 2;
- }
- SUMM_2[1] = vget_high_f32(SUMM);
- SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
- SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
- /* Ok, now we have result accumulated in SUMM_2[0].0 */
-
- if (len > 0) {
- /* Case when you have one value left */
- XX_2 = vld1_dup_f32(xi);
- YY_2 = vld1_dup_f32(yi);
- SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
- }
-
- vst1_lane_f32(sum, SUMM_2[0], 0);
-}
-
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch) {
int i;
@@ -301,12 +203,9 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
(float32_t *)xcorr+i, len);
}
- /* In case max_pitch isn't multiple of 4
- * compute single correlation value per iteration
- */
+ /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
for (; i < max_pitch; i++) {
- xcorr_kernel_neon_float_process1((const float32_t *)_x,
- (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
+ xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
}
}
#endif
--
2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:17 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
I'm attaching the previous 11 NEON optimization patches in a zip file, which is the same as the one I sent on September 6th in reply to the thread "[PATCH 9/9] Optimize silk_inner_prod_aligned_scale() for ARM NEON".

Thanks,
Linfeng

On Mon, Sep 12, 2016 at 5:03 PM, Linfeng Zhang <linfengz at google.com> wrote:
> Should call celt_inner_prod().
> ---
> celt/bands.c | 7 ++++---
> celt/bands.h | 2 +-
> celt/celt_encoder.c | 6 +++---
> celt/pitch.c | 2 +-
> src/opus_multistream_encoder.c | 2 +-
> 5 files changed, 10 insertions(+), 9 deletions(-)
>
> diff --git a/celt/bands.c b/celt/bands.c
> index bbe8a4c..1ab24aa 100644
> --- a/celt/bands.c
> +++ b/celt/bands.c
> @@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos)
>
> #ifdef FIXED_POINT
> /* Compute the amplitude (sqrt energy) in each of the bands */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM)
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch)
> {
> int i, c, N;
> const opus_int16 *eBands = m->eBands;
> + (void)arch;
> N = m->shortMdctSize<<LM;
> c=0; do {
> for (i=0;i<end;i++)
> @@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig
> * OPUS_RESTRICT freq, cel
>
> #else /* FIXED_POINT */
> /* Compute the amplitude (sqrt energy) in each of the bands */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM)
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch)
> {
> int i, c, N;
> const opus_int16 *eBands = m->eBands;
> @@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const
> celt_sig *X, celt_ener *band
> for (i=0;i<end;i++)
> {
> opus_val32 sum;
> - sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)],
> &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
> + sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)],
> &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch);
> bandE[i+c*m->nbEBands] = celt_sqrt(sum);
> /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
> }
> diff --git a/celt/bands.h b/celt/bands.h
> index c040c7f..61ae0cd 100644
> --- a/celt/bands.h
> +++ b/celt/bands.h
> @@ -41,7 +41,7 @@
> * @param X Spectrum
> * @param bandE Square root of the energy for each band (returned)
> */
> -void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM);
> +void compute_band_energies(const CELTMode *m, const celt_sig *X,
> celt_ener *bandE, int end, int C, int LM, int arch);
>
> /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const
> opus_val16 *tonality, celt_ener *bandE);*/
>
> diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
> index 5be7610..8af61d5 100644
> --- a/celt/celt_encoder.c
> +++ b/celt/celt_encoder.c
> @@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> if (secondMdct)
> {
> compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
> amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
> for (i=0;i<C*nbEBands;i++)
> bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
> @@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample,
> st->arch);
> if (CC==2&&C==1)
> tf_chan = 0;
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
>
> if (st->lfe)
> {
> @@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT
> st, const opus_val16 * pcm,
> isTransient = 1;
> shortBlocks = M;
> compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM,
> st->upsample, st->arch);
> - compute_band_energies(mode, freq, bandE, effEnd, C, LM);
> + compute_band_energies(mode, freq, bandE, effEnd, C, LM,
> st->arch);
> amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
> /* Compensate for the scaling of short vs long mdcts */
> for (i=0;i<C*nbEBands;i++)
> diff --git a/celt/pitch.c b/celt/pitch.c
> index bf46e7d..f944a33 100644
> --- a/celt/pitch.c
> +++ b/celt/pitch.c
> @@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT
> x_lp, opus_val16 * OPUS_RESTR
> for (j=0;j<len>>1;j++)
> sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
> #else
> - sum = celt_inner_prod_c(x_lp, y+i, len>>1);
> + sum = celt_inner_prod(x_lp, y+i, len>>1, arch);
> #endif
> xcorr[i] = MAX32(-1, sum);
> #ifdef FIXED_POINT
> diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_
> encoder.c
> index c07132f..6ecea5d 100644
> --- a/src/opus_multistream_encoder.c
> +++ b/src/opus_multistream_encoder.c
> @@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode,
> const void *pcm, opus_val16 *b
> freq[i] = 0;
> }
>
> - compute_band_energies(celt_mode, freq, bandE, 21, 1, LM);
> + compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch);
> amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1);
> /* Apply spreading function with -6 dB/band going up and -12
> dB/band going down. */
> for (i=1;i<21;i++)
> --
> 2.8.0.rc3.226.g39d4020
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: opus-NEON-11-patches.zip
Type: application/zip
Size: 71375 bytes
Desc: not available
URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.zip>