Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
Should call celt_inner_prod(). --- celt/bands.c | 7 ++++--- celt/bands.h | 2 +- celt/celt_encoder.c | 6 +++--- celt/pitch.c | 2 +- src/opus_multistream_encoder.c | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/celt/bands.c b/celt/bands.c index bbe8a4c..1ab24aa 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos) #ifdef FIXED_POINT /* Compute the amplitude (sqrt energy) in each of the bands */ -void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM) +void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch) { int i, c, N; const opus_int16 *eBands = m->eBands; + (void)arch; N = m->shortMdctSize<<LM; c=0; do { for (i=0;i<end;i++) @@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel #else /* FIXED_POINT */ /* Compute the amplitude (sqrt energy) in each of the bands */ -void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM) +void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch) { int i, c, N; const opus_int16 *eBands = m->eBands; @@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band for (i=0;i<end;i++) { opus_val32 sum; - sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM); + sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch); bandE[i+c*m->nbEBands] = celt_sqrt(sum); /*printf ("%f ", bandE[i+c*m->nbEBands]);*/ } diff --git a/celt/bands.h b/celt/bands.h index c040c7f..61ae0cd 100644 --- a/celt/bands.h +++ b/celt/bands.h @@ -41,7 +41,7 @@ * @param X Spectrum * @param bandE Square root of the energy for each band (returned) */ -void 
compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM); +void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch); /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/ diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 5be7610..8af61d5 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (secondMdct) { compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); - compute_band_energies(mode, freq, bandE, effEnd, C, LM); + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); for (i=0;i<C*nbEBands;i++) bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); @@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch); if (CC==2&&C==1) tf_chan = 0; - compute_band_energies(mode, freq, bandE, effEnd, C, LM); + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); if (st->lfe) { @@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, isTransient = 1; shortBlocks = M; compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch); - compute_band_energies(mode, freq, bandE, effEnd, C, LM); + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE, C); /* Compensate for the scaling of short vs long mdcts */ for (i=0;i<C*nbEBands;i++) diff --git a/celt/pitch.c b/celt/pitch.c index bf46e7d..f944a33 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR for (j=0;j<len>>1;j++) sum += 
SHR32(MULT16_16(x_lp[j],y[i+j]), shift); #else - sum = celt_inner_prod_c(x_lp, y+i, len>>1); + sum = celt_inner_prod(x_lp, y+i, len>>1, arch); #endif xcorr[i] = MAX32(-1, sum); #ifdef FIXED_POINT diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index c07132f..6ecea5d 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode, const void *pcm, opus_val16 *b freq[i] = 0; } - compute_band_energies(celt_mode, freq, bandE, 21, 1, LM); + compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch); amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1); /* Apply spreading function with -6 dB/band going up and -12 dB/band going down. */ for (i=1;i<21;i++) -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 13/15] Replace call of celt_inner_prod_c() (step 2)
Should call celt_inner_prod(). This requires changing the API of celt_pitch_xcorr() so that it takes an "arch" argument. --- celt/arm/arm_celt_map.c | 4 ++-- celt/arm/celt_neon_intr.c | 3 ++- celt/arm/pitch_arm.h | 24 +++++++++--------------- celt/pitch.c | 13 ------------- celt/pitch.h | 11 ++--------- 5 files changed, 15 insertions(+), 40 deletions(-) diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 1d53d62..6e28c70 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -51,7 +51,7 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \ (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP))) opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, - const opus_val16 *, opus_val32 *, int , int) = { + const opus_val16 *, opus_val32 *, int, int, int) = { celt_pitch_xcorr_c, /* ARMv4 */ MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ @@ -62,7 +62,7 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, # else /* !FIXED_POINT */ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, - const opus_val16 *, opus_val32 *, int, int) = { + const opus_val16 *, opus_val32 *, int, int, int) = { celt_pitch_xcorr_c, /* ARMv4 */ celt_pitch_xcorr_c, /* EDSP */ celt_pitch_xcorr_c, /* Media */ diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c index 47bbe3d..f8f1d43 100644 --- a/celt/arm/celt_neon_intr.c +++ b/celt/arm/celt_neon_intr.c @@ -290,8 +290,9 @@ static void xcorr_kernel_neon_float_process1(const float32_t *x, } void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch) { + opus_val32 *xcorr, int len, int max_pitch, int arch) { int i; + (void)arch; celt_assert(max_pitch > 0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index 1433116..d8b022e 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -34,7 +34,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON) opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch); + opus_val32 *xcorr, int len, int max_pitch, int arch); # endif # if defined(OPUS_ARM_MAY_HAVE_MEDIA) @@ -43,7 +43,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y, # if defined(OPUS_ARM_MAY_HAVE_EDSP) opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch); + opus_val32 *xcorr, int len, int max_pitch, int arch); # endif # if defined(OPUS_HAVE_RTCD) && \ @@ -52,18 +52,15 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP))) extern opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, - const opus_val16 *, opus_val32 *, int, int); + const opus_val16 *, opus_val32 *, int, int, int); # define OVERRIDE_PITCH_XCORR (1) -# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ - ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ - xcorr, len, max_pitch)) +# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK]) # elif defined(OPUS_ARM_PRESUME_EDSP) || \ defined(OPUS_ARM_PRESUME_MEDIA) || \ defined(OPUS_ARM_PRESUME_NEON) # define OVERRIDE_PITCH_XCORR (1) -# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ - ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) +# define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr)) # endif @@ -99,25 +96,22 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( /* Float case */ #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) void celt_pitch_xcorr_float_neon(const opus_val16 
*_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch); + opus_val32 *xcorr, int len, int max_pitch, int arch); #endif # if defined(OPUS_HAVE_RTCD) && \ (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) extern void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, - const opus_val16 *, opus_val32 *, int, int); + const opus_val16 *, opus_val32 *, int, int, int); # define OVERRIDE_PITCH_XCORR (1) -# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ - ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ - xcorr, len, max_pitch)) +# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK]) # elif defined(OPUS_ARM_PRESUME_NEON_INTR) # define OVERRIDE_PITCH_XCORR (1) -# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ - ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch)) +# define celt_pitch_xcorr celt_pitch_xcorr_float_neon # endif diff --git a/celt/pitch.c b/celt/pitch.c index f944a33..874929e 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -220,13 +220,8 @@ opus_val32 #else void #endif -#if defined(OVERRIDE_PITCH_XCORR) celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch) -#else -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch, int arch) -#endif { #if 0 /* This is a simple version of the pitch correlation that should work @@ -265,11 +260,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, for (i=0;i<max_pitch-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; -#if defined(OVERRIDE_PITCH_XCORR) - xcorr_kernel_c(_x, _y+i, sum, len); -#else xcorr_kernel(_x, _y+i, sum, len, arch); -#endif xcorr[i]=sum[0]; xcorr[i+1]=sum[1]; xcorr[i+2]=sum[2]; @@ -285,11 +276,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, for (;i<max_pitch;i++) { opus_val32 sum; -#if defined(OVERRIDE_PITCH_XCORR) - sum = celt_inner_prod_c(_x, _y+i, 
len); -#else sum = celt_inner_prod(_x, _y+i, len, arch); -#endif xcorr[i] = sum; #ifdef FIXED_POINT maxcorr = MAX32(maxcorr, sum); diff --git a/celt/pitch.h b/celt/pitch.h index d350353..d797844 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -184,17 +184,10 @@ opus_val32 void #endif celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch); - -#if !defined(OVERRIDE_PITCH_XCORR) -#ifdef FIXED_POINT -opus_val32 -#else -void -#endif -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch, int arch); +#ifndef OVERRIDE_PITCH_XCORR +# define celt_pitch_xcorr celt_pitch_xcorr_c #endif #endif -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 14/15] Optimize celt_inner_prod() and dual_inner_prod() for ARM NEON
Created a corresponding unit test. The fixed-point optimizations are bit exact with the C functions. The floating-point optimizations are not bit exact with the C functions, because the order of the floating-point operations changes. But they are bit exact with the simulation C functions, which simulate the floating-point operations in the optimizations. --- celt/arm/arm_celt_map.c | 17 ++ celt/arm/pitch_arm.h | 36 ++++ celt/arm/pitch_neon_intr.c | 179 ++++++++++++++++++++ celt/pitch.h | 3 +- celt/tests/test_unit_dft.c | 1 + celt/tests/test_unit_mathops.c | 1 + celt/tests/test_unit_mdct.c | 1 + celt/tests/test_unit_optimization_pitch.c | 263 ++++++++++++++++++++++++++++++ celt/tests/test_unit_rotation.c | 1 + celt/x86/pitch_sse.h | 5 +- celt_sources.mk | 3 +- tests/test_unit_optimization.c | 4 + 12 files changed, 508 insertions(+), 6 deletions(-) create mode 100644 celt/arm/pitch_neon_intr.c create mode 100644 celt/tests/test_unit_optimization_pitch.c diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 6e28c70..a1a553a 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -36,6 +36,23 @@ #if defined(OPUS_HAVE_RTCD) +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = { + celt_inner_prod_c, /* ARMv4 */ + celt_inner_prod_c, /* EDSP */ + celt_inner_prod_c, /* Media */ + MAY_HAVE_NEON(celt_inner_prod) /* NEON */ +}; + +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) = { + dual_inner_prod_c, /* ARMv4 */ + dual_inner_prod_c, /* EDSP */ + dual_inner_prod_c, /* Media */ + MAY_HAVE_NEON(dual_inner_prod) /* NEON */ +}; +# endif + # if defined(FIXED_POINT) # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, diff --git
a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index d8b022e..d1a8db0 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -30,6 +30,42 @@ # include "armcpu.h" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N); +void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, + const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_CELT_INNER_PROD (1) +# define OVERRIDE_DUAL_INNER_PROD (1) +# define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N)) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2)) +# endif + +# if !defined(OVERRIDE_CELT_INNER_PROD) +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N); +# define OVERRIDE_CELT_INNER_PROD (1) +# define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_CELT_INNER_PROD (1) +# define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N)) +# endif +# endif + +# if !defined(OVERRIDE_DUAL_INNER_PROD) +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, + const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2); +# define OVERRIDE_DUAL_INNER_PROD (1) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_DUAL_INNER_PROD (1) +# define dual_inner_prod(x, y01, 
y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2)) +# endif +# endif + # if defined(FIXED_POINT) # if defined(OPUS_ARM_MAY_HAVE_NEON) diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c new file mode 100644 index 0000000..2bda6e1 --- /dev/null +++ b/celt/arm/pitch_neon_intr.c @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 Google Inc. */ +/** + @file pitch_neon_intr.c + @brief ARM Neon Intrinsic optimizations for celt pitch functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <arm_neon.h> +#include "pitch.h" + +opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) +{ + int i; + opus_val32 xy; + +#ifdef FIXED_POINT + int16x8_t x_s16x8, y_s16x8; + int32x4_t xy_s32x4 = vdupq_n_s32(0); + int64x2_t xy_s64x2; + int64x1_t xy_s64x1; + + for (i = 0; i < N - 7; i += 8) { + x_s16x8 = vld1q_s16(&x[i]); + y_s16x8 = vld1q_s16(&y[i]); + xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8)); + xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8)); + } + + if (N - i >= 4) { + const int16x4_t x_s16x4 = vld1_s16(&x[i]); + const int16x4_t y_s16x4 = vld1_s16(&y[i]); + xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4); + i += 4; + } + + xy_s64x2 = vpaddlq_s32(xy_s32x4); + xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2)); + xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0); +#else + float32x4_t xy_f32x4 = vdupq_n_f32(0); + float32x2_t xy_f32x2; + + for (i = 0; i < N - 7; i += 8) { + float32x4_t x_f32x4, y_f32x4; + x_f32x4 = vld1q_f32(&x[i]); + y_f32x4 = vld1q_f32(&y[i]); + xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); + x_f32x4 = vld1q_f32(&x[i + 4]); + y_f32x4 = vld1q_f32(&y[i + 4]); + xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); + } + + if (N - i >= 4) { + const float32x4_t x_f32x4 = vld1q_f32(&x[i]); + const float32x4_t y_f32x4 = vld1q_f32(&y[i]); + xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); + i += 4; + } + + xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4)); + xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2); + xy = vget_lane_f32(xy_f32x2, 0); +#endif + + for (; i < N; i++) { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + +void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + opus_val32 xy01, xy02; + +#ifdef FIXED_POINT + int16x8_t x_s16x8, y01_s16x8, 
y02_s16x8; + int32x4_t xy01_s32x4 = vdupq_n_s32(0); + int32x4_t xy02_s32x4 = vdupq_n_s32(0); + int64x2_t xy01_s64x2, xy02_s64x2; + int64x1_t xy01_s64x1, xy02_s64x1; + + for (i = 0; i < N - 7; i += 8) { + x_s16x8 = vld1q_s16(&x[i]); + y01_s16x8 = vld1q_s16(&y01[i]); + y02_s16x8 = vld1q_s16(&y02[i]); + xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8)); + xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8)); + xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8)); + xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8)); + } + + if (N - i >= 4) { + const int16x4_t x_s16x4 = vld1_s16(&x[i]); + const int16x4_t y01_s16x4 = vld1_s16(&y01[i]); + const int16x4_t y02_s16x4 = vld1_s16(&y02[i]); + xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4); + xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4); + i += 4; + } + + xy01_s64x2 = vpaddlq_s32(xy01_s32x4); + xy02_s64x2 = vpaddlq_s32(xy02_s32x4); + xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2)); + xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2)); + xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0); + xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0); +#else + float32x4_t xy01_f32x4 = vdupq_n_f32(0); + float32x4_t xy02_f32x4 = vdupq_n_f32(0); + float32x2_t xy01_f32x2, xy02_f32x2; + + for (i = 0; i < N - 7; i += 8) { + float32x4_t x_f32x4, y01_f32x4, y02_f32x4; + x_f32x4 = vld1q_f32(&x[i]); + y01_f32x4 = vld1q_f32(&y01[i]); + y02_f32x4 = vld1q_f32(&y02[i]); + xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); + xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); + x_f32x4 = vld1q_f32(&x[i + 4]); + y01_f32x4 = vld1q_f32(&y01[i + 4]); + y02_f32x4 = vld1q_f32(&y02[i + 4]); + xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); + xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); + } + + if (N - i >= 4) { + const 
float32x4_t x_f32x4 = vld1q_f32(&x[i]); + const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]); + const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]); + xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); + xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); + i += 4; + } + + xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4)); + xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4)); + xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2); + xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2); + xy01 = vget_lane_f32(xy01_f32x2, 0); + xy02 = vget_lane_f32(xy02_f32x2, 0); +#endif + + for (; i < N; i++) { + xy01 = MAC16_16(xy01, x[i], y01[i]); + xy02 = MAC16_16(xy02, x[i], y02[i]); + } + *xy1 = xy01; + *xy2 = xy02; +} diff --git a/celt/pitch.h b/celt/pitch.h index d797844..e425f56 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -46,8 +46,7 @@ #include "mips/pitch_mipsr1.h" #endif -#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ - || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) # include "arm/pitch_arm.h" #endif diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 582618e..02904bf 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -54,6 +54,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" +# include "arm/pitch_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "mdct.c" # include "arm/celt_ne10_fft.c" diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index f5af994..524c1f8 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -69,6 +69,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" +# include "arm/pitch_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" # include "mdct.c" diff --git a/celt/tests/test_unit_mdct.c 
b/celt/tests/test_unit_mdct.c index 0658c7a..3b28767 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -55,6 +55,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" +# include "arm/pitch_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "arm/celt_ne10_fft.c" # include "arm/celt_ne10_mdct.c" diff --git a/celt/tests/test_unit_optimization_pitch.c b/celt/tests/test_unit_optimization_pitch.c new file mode 100644 index 0000000..64bb2a9 --- /dev/null +++ b/celt/tests/test_unit_optimization_pitch.c @@ -0,0 +1,263 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stdio.h> +#include <string.h> + +#include "modes.h" +#include "pitch.h" + +#define MAX_LEN_INNER_PROD 960 + +#ifndef UNIT_TEST_CELT_INNER_PROD +#define UNIT_TEST_CELT_INNER_PROD + +static inline float rand_float(float min, float max) +{ + return ((max - min) * ((float)rand() / RAND_MAX)) + min; +} + +static OPUS_INLINE opus_val16 rand_val16(opus_val16 min, opus_val16 max) +{ +#ifdef FIXED_POINT + (void)min; + (void)max; + return rand(); +#else + return rand_float(min, max); +#endif +} + +static OPUS_INLINE void init_val16_buffer(opus_val16* buffer, int num) +{ + const opus_val16 min = (opus_val16)-1e10; + const opus_val16 max = (opus_val16) 1e10; + + for (int i = 0; i < num; i++) { + buffer[i] = rand_val16(min, max); + } +} + +#endif + +/* ========================================================================== */ +/* This part of code simulates floating-point operations. */ + +#ifndef FIXED_POINT + +/* celt_inner_prod_float_simulation_sse() simulates the floating operations of + * celt_inner_prod_sse(), and both functions should have bit exact output. + */ +opus_val32 celt_inner_prod_float_simulation_sse(const opus_val16 *x, + const opus_val16 *y, int N) +{ + int i; + opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0; + for (i = 0; i < N - 3; i += 4) { + xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]); + xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]); + xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]); + xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]); + } + xy0 += xy2; + xy1 += xy3; + xy = xy0 + xy1; + for (; i < N; i++) { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + +/* dual_inner_prod_float_simulation_sse() simulates the floating-point operations + * of dual_inner_prod_sse(), and both functions should have bit exact output. 
+ */ +void dual_inner_prod_float_simulation_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0; + for (i = 0; i < N - 3; i += 4) { + xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]); + xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]); + xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]); + xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]); + xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]); + xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]); + xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]); + xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]); + } + xy01_0 += xy01_2; + xy02_0 += xy02_2; + xy01_1 += xy01_3; + xy02_1 += xy02_3; + xy01 = xy01_0 + xy01_1; + xy02 = xy02_0 + xy02_1; + for (; i < N; i++) { + xy01 = MAC16_16(xy01, x[i], y01[i]); + xy02 = MAC16_16(xy02, x[i], y02[i]); + } + *xy1 = xy01; + *xy2 = xy02; +} + +# define celt_inner_prod_float_simulation_c celt_inner_prod_c +# define dual_inner_prod_float_simulation_c dual_inner_prod_c + +/* Reuse since NEON optimizations happen to have the same simulated floating-point operations as SSE optimization. 
*/ +# define celt_inner_prod_float_simulation_neon celt_inner_prod_float_simulation_sse +# define dual_inner_prod_float_simulation_neon dual_inner_prod_float_simulation_sse + +# ifdef OPUS_X86_MAY_HAVE_SSE +# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1) +# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1) +# ifdef OPUS_X86_PRESUME_SSE +# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_sse(x, y, N)) +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_sse(x, y01, y02, N, xy1, xy2)) +# else +# define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) +opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = { + celt_inner_prod_float_simulation_c, /* non-sse */ + MAY_HAVE_SSE(celt_inner_prod_float_simulation), + MAY_HAVE_SSE(celt_inner_prod_float_simulation), + MAY_HAVE_SSE(celt_inner_prod_float_simulation), + MAY_HAVE_SSE(celt_inner_prod_float_simulation) +}; +void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = { + dual_inner_prod_float_simulation_c, /* non-sse */ + MAY_HAVE_SSE(dual_inner_prod_float_simulation), + MAY_HAVE_SSE(dual_inner_prod_float_simulation), + MAY_HAVE_SSE(dual_inner_prod_float_simulation), + MAY_HAVE_SSE(dual_inner_prod_float_simulation) +}; +# endif /* !defined(OPUS_X86_PRESUME_SSE) */ +# endif /* OPUS_X86_MAY_HAVE_SSE */ + +# ifdef OPUS_ARM_MAY_HAVE_NEON_INTR +# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1) +# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1) +# 
ifndef OPUS_HAVE_RTCD +# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod_float_simulation)(x, y, N)) +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod_float_simulation)(x, y01, y02, N, xy1, xy2)) +# else +# ifdef OPUS_ARM_PRESUME_NEON_INTR +# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_neon(x, y, N)) +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_neon(x, y01, y02, N, xy1, xy2)) +# else +# define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) +opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = { + celt_inner_prod_float_simulation_c, /* ARMv4 */ + celt_inner_prod_float_simulation_c, /* EDSP */ + celt_inner_prod_float_simulation_c, /* Media */ + MAY_HAVE_NEON(celt_inner_prod_float_simulation) /* NEON */ +}; +void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = { + dual_inner_prod_float_simulation_c, /* ARMv4 */ + dual_inner_prod_float_simulation_c, /* EDSP */ + dual_inner_prod_float_simulation_c, /* Media */ + MAY_HAVE_NEON(dual_inner_prod_float_simulation) /* NEON */ +}; +# endif /* !defined(OPUS_ARM_PRESUME_NEON_INTR) */ +# endif /* OPUS_HAVE_RTCD */ +# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */ + +# ifndef OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION +# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),celt_inner_prod_float_simulation_c(x, y, N)) +# endif + +# 
ifndef OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION +# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),dual_inner_prod_float_simulation_c(x, y01, y02, N, xy1, xy2)) +# endif + +#endif /* !FIXED_POINT */ + +/* ========================================================================== */ + +static int test_celt_inner_prod(int arch) +{ + opus_val16 x[MAX_LEN_INNER_PROD], y[MAX_LEN_INNER_PROD]; + opus_val32 xy_org, xy_opt; + int N; + + printf("%44s() ...", __func__); + init_val16_buffer(x, MAX_LEN_INNER_PROD); + init_val16_buffer(y, MAX_LEN_INNER_PROD); + for (N = 0; N <= MAX_LEN_INNER_PROD; N++) { +#ifdef FIXED_POINT + xy_org = celt_inner_prod_c(x, y, N); +#else + xy_org = celt_inner_prod_float_simulation(x, y, N, arch); +#endif + xy_opt = celt_inner_prod(x, y, N, arch); + if (xy_org != xy_opt) { +#ifdef FIXED_POINT + printf("\nN=%d xy_org = %d, xy_opt = %d failed!", N, xy_org, xy_opt); +#else + printf("\nN=%d xy_org = %f, xy_opt = %f failed!", N, xy_org, xy_opt); +#endif + return -1; + } + } + printf(" passed!\n"); + return 0; +} + +static int test_dual_inner_prod(int arch) +{ + opus_val16 x[MAX_LEN_INNER_PROD], y01[MAX_LEN_INNER_PROD], y02[MAX_LEN_INNER_PROD]; + opus_val32 xy1_org, xy1_opt, xy2_org, xy2_opt; + int N; + + printf("%44s() ...", __func__); + init_val16_buffer(x, MAX_LEN_INNER_PROD); + init_val16_buffer(y01, MAX_LEN_INNER_PROD); + init_val16_buffer(y02, MAX_LEN_INNER_PROD); + for (N = 0; N <= MAX_LEN_INNER_PROD; N++) { +#ifdef FIXED_POINT + dual_inner_prod_c(x, y01, y02, N, &xy1_org, &xy2_org); +#else + dual_inner_prod_float_simulation(x, y01, y02, N, &xy1_org, &xy2_org, arch); +#endif + dual_inner_prod(x, y01, y02, N, &xy1_opt, &xy2_opt, arch); + if ((xy1_org != xy1_opt) || (xy2_org != xy2_opt)) { +#ifdef FIXED_POINT + printf("\nN=%d xy1_org = %d, xy1_opt = %d failed!", N, xy1_org, xy1_opt); + printf("\nN=%d xy2_org = %d, xy2_opt = %d failed!", N, xy2_org, xy2_opt); +#else + printf("\nN=%d xy1_org = %f, xy1_opt = %f 
failed!", N, xy1_org, xy1_opt); + printf("\nN=%d xy2_org = %f, xy2_opt = %f failed!", N, xy2_org, xy2_opt); +#endif + return -1; + } + } + printf(" passed!\n"); + return 0; +} diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 785b40d..0b73839 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -67,6 +67,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" +# include "arm/pitch_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" # include "mdct.c" diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index e5f87ab..5e85599 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2( int N); #endif -#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) opus_val32 celt_inner_prod_sse( const opus_val16 *x, const opus_val16 *y, @@ -104,7 +104,7 @@ opus_val32 celt_inner_prod_sse( #define celt_inner_prod(x, y, N, arch) \ ((void)arch, celt_inner_prod_sse4_1(x, y, N)) -#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) #define OVERRIDE_CELT_INNER_PROD #define celt_inner_prod(x, y, N, arch) \ ((void)arch, celt_inner_prod_sse2(x, y, N)) @@ -114,7 +114,6 @@ opus_val32 celt_inner_prod_sse( #define celt_inner_prod(x, y, N, arch) \ ((void)arch, celt_inner_prod_sse(x, y, N)) - #elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) diff --git a/celt_sources.mk b/celt_sources.mk index c4bd285..dc107d9 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -38,7 +38,8 @@ celt/arm/armopts.s.in CELT_SOURCES_ARM_NEON_INTR = \ celt/arm/celt_lpc_neon_intr.c \ -celt/arm/celt_neon_intr.c 
+celt/arm/celt_neon_intr.c \ +celt/arm/pitch_neon_intr.c CELT_SOURCES_ARM_NE10= \ celt/arm/celt_ne10_fft.c \ diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c index 6155dfb..a88ac21 100644 --- a/tests/test_unit_optimization.c +++ b/tests/test_unit_optimization.c @@ -71,6 +71,7 @@ # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "celt/arm/celt_lpc_neon_intr.c" # include "celt/arm/celt_neon_intr.c" +# include "celt/arm/pitch_neon_intr.c" # include "silk/arm/biquad_alt_neon_intr.c" # include "silk/arm/inner_prod_aligned_neon_intr.c" # include "silk/arm/LPC_analysis_filter_neon_intr.c" @@ -94,6 +95,7 @@ #endif +# include "celt/tests/test_unit_optimization_pitch.c" # include "silk/tests/test_unit_optimization_biquad_alt.c" # include "silk/tests/test_unit_optimization_inner_prod_aligned.c" # include "silk/tests/test_unit_optimization_LPC_analysis_filter.c" @@ -118,6 +120,8 @@ int main(void) result |= test_silk_LPC_inverse_pred_gain_Q24(arch); result |= test_warped_autocorrelation(arch); #endif /* FIXED_POINT */ + result |= test_celt_inner_prod(arch); + result |= test_dual_inner_prod(arch); result |= test_silk_biquad_alt(arch); result |= test_silk_inner_prod_aligned_scale(arch); result |= test_silk_LPC_analysis_filter(arch); -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:03 UTC
[opus] [PATCH 15/15] Clean celt_pitch_xcorr_float_neon()
Call celt_inner_prod_neon() and remove redundant code. --- celt/arm/celt_neon_intr.c | 105 +--------------------------------------------- 1 file changed, 2 insertions(+), 103 deletions(-) diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c index f8f1d43..cf44398 100644 --- a/celt/arm/celt_neon_intr.c +++ b/celt/arm/celt_neon_intr.c @@ -191,104 +191,6 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y, vst1q_f32(sum, SUMM); } -/* - * Function: xcorr_kernel_neon_float_process1 - * --------------------------------- - * Computes single correlation values and stores in *sum - */ -static void xcorr_kernel_neon_float_process1(const float32_t *x, - const float32_t *y, float32_t *sum, int len) { - float32x4_t XX[4]; - float32x4_t YY[4]; - float32x2_t XX_2; - float32x2_t YY_2; - float32x4_t SUMM; - float32x2_t SUMM_2[2]; - const float32_t *xi = x; - const float32_t *yi = y; - - SUMM = vdupq_n_f32(0); - - /* Work on 16 values per iteration */ - while (len >= 16) { - XX[0] = vld1q_f32(xi); - xi += 4; - XX[1] = vld1q_f32(xi); - xi += 4; - XX[2] = vld1q_f32(xi); - xi += 4; - XX[3] = vld1q_f32(xi); - xi += 4; - - YY[0] = vld1q_f32(yi); - yi += 4; - YY[1] = vld1q_f32(yi); - yi += 4; - YY[2] = vld1q_f32(yi); - yi += 4; - YY[3] = vld1q_f32(yi); - yi += 4; - - SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); - SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); - SUMM = vmlaq_f32(SUMM, YY[2], XX[2]); - SUMM = vmlaq_f32(SUMM, YY[3], XX[3]); - len -= 16; - } - - /* Work on 8 values */ - if (len >= 8) { - XX[0] = vld1q_f32(xi); - xi += 4; - XX[1] = vld1q_f32(xi); - xi += 4; - - YY[0] = vld1q_f32(yi); - yi += 4; - YY[1] = vld1q_f32(yi); - yi += 4; - - SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); - SUMM = vmlaq_f32(SUMM, YY[1], XX[1]); - len -= 8; - } - - /* Work on 4 values */ - if (len >= 4) { - XX[0] = vld1q_f32(xi); - xi += 4; - YY[0] = vld1q_f32(yi); - yi += 4; - SUMM = vmlaq_f32(SUMM, YY[0], XX[0]); - len -= 4; - } - - /* Start accumulating results */ - SUMM_2[0] = 
vget_low_f32(SUMM); - if (len >= 2) { - /* While at it, consume 2 more values if available */ - XX_2 = vld1_f32(xi); - xi += 2; - YY_2 = vld1_f32(yi); - yi += 2; - SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2); - len -= 2; - } - SUMM_2[1] = vget_high_f32(SUMM); - SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]); - SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]); - /* Ok, now we have result accumulated in SUMM_2[0].0 */ - - if (len > 0) { - /* Case when you have one value left */ - XX_2 = vld1_dup_f32(xi); - YY_2 = vld1_dup_f32(yi); - SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2); - } - - vst1_lane_f32(sum, SUMM_2[0], 0); -} - void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch, int arch) { int i; @@ -301,12 +203,9 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, (float32_t *)xcorr+i, len); } - /* In case max_pitch isn't multiple of 4 - * compute single correlation value per iteration - */ + /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ for (; i < max_pitch; i++) { - xcorr_kernel_neon_float_process1((const float32_t *)_x, - (const float32_t *)_y+i, (float32_t *)xcorr+i, len); + xcorr[i] = celt_inner_prod_neon(_x, _y+i, len); } } #endif -- 2.8.0.rc3.226.g39d4020
Linfeng Zhang
2016-Sep-13 00:17 UTC
[opus] [PATCH 12/15] Replace call of celt_inner_prod_c() (step 1)
I'm attaching the previous 11 NEON optimization patches in a zip file, which is the same as the one I sent on September 6th in reply to the thread "[PATCH 9/9] Optimize silk_inner_prod_aligned_scale() for ARM NEON". Thanks, Linfeng On Mon, Sep 12, 2016 at 5:03 PM, Linfeng Zhang <linfengz at google.com> wrote: > Should call celt_inner_prod(). > --- > celt/bands.c | 7 ++++--- > celt/bands.h | 2 +- > celt/celt_encoder.c | 6 +++--- > celt/pitch.c | 2 +- > src/opus_multistream_encoder.c | 2 +- > 5 files changed, 10 insertions(+), 9 deletions(-) > > diff --git a/celt/bands.c b/celt/bands.c > index bbe8a4c..1ab24aa 100644 > --- a/celt/bands.c > +++ b/celt/bands.c > @@ -92,10 +92,11 @@ static int bitexact_log2tan(int isin,int icos) > > #ifdef FIXED_POINT > /* Compute the amplitude (sqrt energy) in each of the bands */ > -void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM) > +void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM, int arch) > { > int i, c, N; > const opus_int16 *eBands = m->eBands; > + (void)arch; > N = m->shortMdctSize<<LM; > c=0; do { > for (i=0;i<end;i++) > @@ -155,7 +156,7 @@ void normalise_bands(const CELTMode *m, const celt_sig > * OPUS_RESTRICT freq, cel > > #else /* FIXED_POINT */ > /* Compute the amplitude (sqrt energy) in each of the bands */ > -void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM) > +void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM, int arch) > { > int i, c, N; > const opus_int16 *eBands = m->eBands; > @@ -164,7 +165,7 @@ void compute_band_energies(const CELTMode *m, const > celt_sig *X, celt_ener *band > for (i=0;i<end;i++) > { > opus_val32 sum; > - sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], > &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM); > + sum = 1e-27f + 
celt_inner_prod(&X[c*N+(eBands[i]<<LM)], > &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch); > bandE[i+c*m->nbEBands] = celt_sqrt(sum); > /*printf ("%f ", bandE[i+c*m->nbEBands]);*/ > } > diff --git a/celt/bands.h b/celt/bands.h > index c040c7f..61ae0cd 100644 > --- a/celt/bands.h > +++ b/celt/bands.h > @@ -41,7 +41,7 @@ > * @param X Spectrum > * @param bandE Square root of the energy for each band (returned) > */ > -void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM); > +void compute_band_energies(const CELTMode *m, const celt_sig *X, > celt_ener *bandE, int end, int C, int LM, int arch); > > /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const > opus_val16 *tonality, celt_ener *bandE);*/ > > diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c > index 5be7610..8af61d5 100644 > --- a/celt/celt_encoder.c > +++ b/celt/celt_encoder.c > @@ -1606,7 +1606,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT > st, const opus_val16 * pcm, > if (secondMdct) > { > compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); > - compute_band_energies(mode, freq, bandE, effEnd, C, LM); > + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); > amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); > for (i=0;i<C*nbEBands;i++) > bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); > @@ -1615,7 +1615,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT > st, const opus_val16 * pcm, > compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, > st->arch); > if (CC==2&&C==1) > tf_chan = 0; > - compute_band_energies(mode, freq, bandE, effEnd, C, LM); > + compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); > > if (st->lfe) > { > @@ -1739,7 +1739,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT > st, const opus_val16 * pcm, > isTransient = 1; > shortBlocks = M; > compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, > st->upsample, st->arch); > 
- compute_band_energies(mode, freq, bandE, effEnd, C, LM); > + compute_band_energies(mode, freq, bandE, effEnd, C, LM, > st->arch); > amp2Log2(mode, effEnd, end, bandE, bandLogE, C); > /* Compensate for the scaling of short vs long mdcts */ > for (i=0;i<C*nbEBands;i++) > diff --git a/celt/pitch.c b/celt/pitch.c > index bf46e7d..f944a33 100644 > --- a/celt/pitch.c > +++ b/celt/pitch.c > @@ -378,7 +378,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT > x_lp, opus_val16 * OPUS_RESTR > for (j=0;j<len>>1;j++) > sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift); > #else > - sum = celt_inner_prod_c(x_lp, y+i, len>>1); > + sum = celt_inner_prod(x_lp, y+i, len>>1, arch); > #endif > xcorr[i] = MAX32(-1, sum); > #ifdef FIXED_POINT > diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_ > encoder.c > index c07132f..6ecea5d 100644 > --- a/src/opus_multistream_encoder.c > +++ b/src/opus_multistream_encoder.c > @@ -295,7 +295,7 @@ void surround_analysis(const CELTMode *celt_mode, > const void *pcm, opus_val16 *b > freq[i] = 0; > } > > - compute_band_energies(celt_mode, freq, bandE, 21, 1, LM); > + compute_band_energies(celt_mode, freq, bandE, 21, 1, LM, arch); > amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1); > /* Apply spreading function with -6 dB/band going up and -12 > dB/band going down. */ > for (i=1;i<21;i++) > -- > 2.8.0.rc3.226.g39d4020 > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.html> -------------- next part -------------- A non-text attachment was scrubbed... Name: opus-NEON-11-patches.zip Type: application/zip Size: 71375 bytes Desc: not available URL: <http://lists.xiph.org/pipermail/opus/attachments/20160912/0434e43f/attachment-0001.zip>