Viswanath Puttagunta
2015-Mar-12 17:45 UTC
[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.
From: Jonathan Lennox <jonathan at vidyo.com> * Makes --enable-intrinsics work with clang and other non-GCC compilers * Enables RTCD for the floating-point-mode SSE code in Celt. * Disables use of RTCD in cases where the compiler targets an instruction set by default. * Enables the SSE4.1 Silk optimizations that apply to the common parts of Silk when Opus is built in floating-point mode, not just in fixed-point mode. * Enables the SSE intrinsics (with RTCD when appropriate) in the Win32 build. * Fixes a case where GCC would compile SSE2 code as SSE4.1, causing a crash on non-SSE4.1 CPUs. * Allows configuration with compilers with non-GCC-flavor flags for enabling architecture options. * Hopefully makes the configuration and ifdef's easier to follow and understand. Reviewed-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org> --- Makefile.am | 38 ++-- celt/arm/armcpu.c | 6 +- celt/arm/pitch_arm.h | 4 +- celt/bands.c | 6 +- celt/celt.c | 16 +- celt/celt.h | 12 +- celt/celt_decoder.c | 6 +- celt/celt_encoder.c | 4 +- celt/celt_lpc.h | 2 +- celt/cpu_support.h | 15 +- celt/mips/celt_mipsr1.h | 2 +- celt/pitch.c | 4 +- celt/pitch.h | 19 +- celt/tests/test_unit_dft.c | 4 +- celt/tests/test_unit_mathops.c | 11 +- celt/tests/test_unit_mdct.c | 4 +- celt/tests/test_unit_rotation.c | 11 +- celt/x86/celt_lpc_sse.c | 4 + celt/x86/celt_lpc_sse.h | 12 +- celt/x86/pitch_sse.c | 334 +++++++++++++------------------ celt/x86/pitch_sse.h | 256 ++++++++++------------- celt/x86/pitch_sse2.c | 95 +++++++++ celt/x86/pitch_sse4_1.c | 195 ++++++++++++++++++ celt/x86/x86_celt_map.c | 76 ++++++- celt/x86/x86cpu.c | 47 ++++- celt/x86/x86cpu.h | 26 ++- celt_sources.mk | 5 +- configure.ac | 312 ++++++++++++++++++----------- m4/opus-intrinsics.m4 | 29 +++ silk/x86/SigProc_FIX_sse.h | 17 ++ silk/x86/main_sse.h | 48 +++++ silk/x86/x86_silk_map.c | 25 ++- win32/VS2010/celt.vcxproj | 17 +- win32/VS2010/celt.vcxproj.filters | 27 +++ win32/VS2010/silk_common.vcxproj | 17 +- 
win32/VS2010/silk_common.vcxproj.filters | 23 ++- win32/VS2010/silk_fixed.vcxproj | 13 +- win32/VS2010/silk_fixed.vcxproj.filters | 17 +- win32/config.h | 25 ++- 39 files changed, 1210 insertions(+), 574 deletions(-) create mode 100644 celt/x86/pitch_sse2.c create mode 100644 celt/x86/pitch_sse4_1.c create mode 100644 m4/opus-intrinsics.m4 diff --git a/Makefile.am b/Makefile.am index c5c1562..3a75740 100644 --- a/Makefile.am +++ b/Makefile.am @@ -23,6 +23,9 @@ SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1) endif else SILK_SOURCES += $(SILK_SOURCES_FLOAT) +if HAVE_SSE4_1 +SILK_SOURCES += $(SILK_SOURCES_SSE4_1) +endif endif if DISABLE_FLOAT_API @@ -30,12 +33,14 @@ else OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) endif -if HAVE_SSE4_1 -CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1) -else -if HAVE_SSE2 +if HAVE_SSE CELT_SOURCES += $(CELT_SOURCES_SSE) endif +if HAVE_SSE2 +CELT_SOURCES += $(CELT_SOURCES_SSE2) +endif +if HAVE_SSE4_1 +CELT_SOURCES += $(CELT_SOURCES_SSE4_1) endif if CPU_ARM @@ -44,7 +49,6 @@ SILK_SOURCES += $(SILK_SOURCES_ARM) if OPUS_ARM_NEON_INTR CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) -OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon endif if HAVE_ARM_NE10 @@ -262,20 +266,30 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl %-gnu.S: %.s $(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@ -SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o +OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ + $(celt_tests_test_unit_rotation_SOURCES:.c=.o) + +if HAVE_SSE +SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo) +$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS) +endif -if HAVE_SSE4_1 -$(SSE_OBJ): CFLAGS += -msse4.1 -else if HAVE_SSE2 -$(SSE_OBJ): CFLAGS += -msse2 +SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) +$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) endif + +if HAVE_SSE4_1 +SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ + 
$(SILK_SOURCES_SSE4_1:.c=.lo) \ + $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) +$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) endif if OPUS_ARM_NEON_INTR CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ $(CELT_SOURCES_ARM_NE10:.c=.lo) \ - %test_unit_rotation.o %test_unit_mathops.o \ %test_unit_mdct.o %test_unit_dft.o -$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS) $(NE10_CFLAGS) + +$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS) endif diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index 1768525..5e5d10c 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(OPUS_ARM_MAY_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) __try{ /*VORR q0,q0,q0*/ __emit(0xF2200150); @@ -107,7 +107,7 @@ opus_uint32 opus_cpu_capabilities(void) while(fgets(buf, 512, cpuinfo) != NULL) { -# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) /* Search for edsp and neon flag */ if(memcmp(buf, "Features", 8) == 0) { @@ -118,7 +118,7 @@ opus_uint32 opus_cpu_capabilities(void) flags |= OPUS_CPU_ARM_EDSP; # endif -# if defined(OPUS_ARM_MAY_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON; diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index 125d1bc..8626ed7 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -54,10 +54,10 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, #else /* Start !FIXED_POINT */ /* Float case */ -#if defined(OPUS_ARM_NEON_INTR) +#if 
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch); -#if !defined(OPUS_HAVE_RTCD) +#if !defined(OPUS_HAVE_RTCD) || defined(OPUS_ARM_PRESUME_NEON_INTR) #define OVERRIDE_PITCH_XCORR (1) # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch)) diff --git a/celt/bands.c b/celt/bands.c index c643b09..25f229e 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -398,7 +398,7 @@ static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT } } -static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N) +static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N, int arch) { int j; opus_val32 xp=0, side=0; @@ -410,7 +410,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT opus_val32 t, lgain, rgain; /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ - dual_inner_prod(Y, X, Y, N, &xp, &side); + dual_inner_prod(Y, X, Y, N, &xp, &side, arch); /* Compensating for the mid normalization */ xp = MULT16_32_Q15(mid, xp); /* mid and side are in Q15, not Q14 like X and Y */ @@ -1348,7 +1348,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm if (resynth) { if (N!=2) - stereo_merge(X, Y, mid, N); + stereo_merge(X, Y, mid, N, ctx->arch); if (inv) { int j; diff --git a/celt/celt.c b/celt/celt.c index a610de4..40c62ce 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -89,10 +89,12 @@ int resampling_factor(opus_int32 rate) return ret; } -#ifndef OVERRIDE_COMB_FILTER_CONST /* This version should be faster on ARM */ #ifdef OPUS_ARM_ASM -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, +#ifndef NON_STATIC_COMB_FILTER_CONST_C +static +#endif +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, 
int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { opus_val32 x0, x1, x2, x3, x4; @@ -147,7 +149,10 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, #endif } #else -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, +#ifndef NON_STATIC_COMB_FILTER_CONST_C +static +#endif +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { opus_val32 x0, x1, x2, x3, x4; @@ -171,12 +176,11 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, } #endif -#endif #ifndef OVERRIDE_comb_filter void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap) + const opus_val16 *window, int overlap, int arch) { int i; /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */ @@ -234,7 +238,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, } /* Compute the part with the constant filter. 
*/ - comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12); + comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch); } #endif /* OVERRIDE_comb_filter */ diff --git a/celt/celt.h b/celt/celt.h index b196751..a423b95 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -201,7 +201,17 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RES void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap); + const opus_val16 *window, int overlap, int arch); + +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + +#ifndef OVERRIDE_COMB_FILTER_CONST +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) +#endif void init_caps(const CELTMode *m,int *cap,int LM,int C); diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 304f334..505a6ef 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -699,7 +699,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) comb_filter(etmp, buf+DECODE_BUFFER_SIZE, st->postfilter_period, st->postfilter_period, overlap, -st->postfilter_gain, -st->postfilter_gain, - st->postfilter_tapset, st->postfilter_tapset, NULL, 0); + st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); /* Simulate TDAC on the concealed audio so that it blends with the MDCT of the next frame. 
*/ @@ -1011,11 +1011,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD); comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize, st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); if (LM!=0) comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize, st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); } while (++c<CC); st->postfilter_period_old = st->postfilter_period; diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 5f48638..1c9dbcb 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1166,11 +1166,11 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, if (offset) comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD, st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain, - st->prefilter_tapset, st->prefilter_tapset, NULL, 0); + st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch); comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset, st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1, - st->prefilter_tapset, prefilter_tapset, mode->window, overlap); + st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch); OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap); if (N>COMBFILTER_MAXPERIOD) diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index dc8967f..323459e 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -48,7 +48,7 @@ void celt_fir_c( opus_val16 *mem, int arch); -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if !defined(OVERRIDE_CELT_FIR) 
#define celt_fir(x, num, y, N, ord, mem, arch) \ (celt_fir_c(x, num, y, N, ord, mem, arch)) #endif diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 1d62e2f..5e99a90 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -32,7 +32,8 @@ #include "opus_defines.h" #if defined(OPUS_HAVE_RTCD) && \ - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) + #include "arm/armcpu.h" /* We currently support 4 ARM variants: @@ -43,14 +44,16 @@ */ #define OPUS_ARCHMASK 3 -#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) #include "x86/x86cpu.h" -/* We currently support 3 x86 variants: +/* We currently support 4 x86 variants: * arch[0] -> non-sse - * arch[1] -> sse2 - * arch[2] -> sse4.1 - * arch[3] -> NULL + * arch[1] -> sse + * arch[2] -> sse2 + * arch[3] -> sse4.1 */ #define OPUS_ARCHMASK 3 int opus_select_arch(void); diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h index 03915d8..7915d59 100644 --- a/celt/mips/celt_mipsr1.h +++ b/celt/mips/celt_mipsr1.h @@ -56,7 +56,7 @@ #define OVERRIDE_comb_filter void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap) + const opus_val16 *window, int overlap, int arch) { int i; opus_val32 x0, x1, x2, x3, x4; diff --git a/celt/pitch.c b/celt/pitch.c index 4364703..1d89cb0 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -439,7 +439,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, T = T0 = *T0_; ALLOC(yy_lookup, maxperiod+1, opus_val32); - dual_inner_prod(x, x, x-T0, N, &xx, &xy); + dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch); yy_lookup[0] = xx; yy=xx; for 
(i=1;i<=maxperiod;i++) @@ -483,7 +483,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, { T1b = celt_udiv(2*second_check[k]*T0+k, 2*k); } - dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2); + dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch); xy += xy2; yy = yy_lookup[T1] + yy_lookup[T1b]; #ifdef FIXED_POINT diff --git a/celt/pitch.h b/celt/pitch.h index 4368cc5..af745eb 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -37,8 +37,8 @@ #include "modes.h" #include "cpu_support.h" -#if defined(__SSE__) && !defined(FIXED_POINT) \ - || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \ + || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) #include "x86/pitch_sse.h" #endif @@ -135,8 +135,7 @@ static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * #endif /* OVERRIDE_XCORR_KERNEL */ -#ifndef OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, +static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) { int i; @@ -150,6 +149,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y *xy1 = xy01; *xy2 = xy02; } + +#ifndef OVERRIDE_DUAL_INNER_PROD +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2)) #endif /*We make sure a C version is always available for cases where the overhead of @@ -169,6 +172,12 @@ static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x, ((void)(arch),celt_inner_prod_c(x, y, N)) #endif +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + + #ifdef FIXED_POINT opus_val32 #else 
@@ -180,7 +189,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, #if !defined(OVERRIDE_PITCH_XCORR) /*Is run-time CPU detection enabled on this platform?*/ # if defined(OPUS_HAVE_RTCD) && \ - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) + (defined(OPUS_ARM_ASM) || (defined(OPUS_ARM_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))) extern # if defined(FIXED_POINT) opus_val32 diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 84f69bd..57691c6 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -50,7 +50,7 @@ #include "entcode.c" #if defined(OPUS_HAVE_RTCD) && \ - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) #include "arm/armcpu.c" #if defined(HAVE_ARM_NE10) #include "arm/celt_ne10_fft.c" @@ -60,6 +60,8 @@ #include "arm/arm_celt_map.c" #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) #include "x86/x86cpu.c" +#include "celt/x86/pitch_sse.c" +#include "x86/x86_celt_map.c" #endif #ifndef M_PI diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 0f1e4f1..379fbd5 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -49,12 +49,21 @@ #include "cwrs.c" #include "pitch.c" #include "celt_lpc.c" +#include "celt.c" #include "kiss_fft.c" #include "mdct.c" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE) || \ + defined(OPUS_X86_MAY_HAVE_SSE2) || \ + defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE) #include "x86/pitch_sse.c" +#endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse2.c" +#endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/pitch_sse4_1.c" #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index c64cac2..d8c4ef0 100644 --- 
a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -49,7 +49,7 @@ #include "entcode.c" #if defined(OPUS_HAVE_RTCD) && \ - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) #include "arm/armcpu.c" #if defined(HAVE_ARM_NE10) #include "arm/celt_ne10_fft.c" @@ -60,6 +60,8 @@ #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) #include "x86/x86cpu.c" +#include "celt/x86/pitch_sse.c" +#include "x86/x86_celt_map.c" #endif #ifndef M_PI diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index ce14936..3cf54fa 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -46,13 +46,22 @@ #include "bands.h" #include "pitch.c" #include "celt_lpc.c" +#include "celt.c" #include "kiss_fft.c" #include "mdct.c" #include <math.h> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE) || \ + defined(OPUS_X86_MAY_HAVE_SSE2) || \ + defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE) #include "x86/pitch_sse.c" +#endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse2.c" +#endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/pitch_sse4_1.c" #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c index 9fb9779..67e5592 100644 --- a/celt/x86/celt_lpc_sse.c +++ b/celt/x86/celt_lpc_sse.c @@ -38,6 +38,8 @@ #include "pitch.h" #include "x86cpu.h" +#if defined(FIXED_POINT) + void celt_fir_sse4_1(const opus_val16 *_x, const opus_val16 *num, opus_val16 *_y, @@ -126,3 +128,5 @@ void celt_fir_sse4_1(const opus_val16 *_x, #endif RESTORE_STACK; } + +#endif diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index f111420..c5ec796 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -32,7 +32,9 @@ #include "config.h" #endif -#if 
defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_FIR + void celt_fir_sse4_1( const opus_val16 *x, const opus_val16 *num, @@ -42,6 +44,12 @@ void celt_fir_sse4_1( opus_val16 *mem, int arch); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) + +#else + extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *num, @@ -56,3 +64,5 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( #endif #endif + +#endif diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c index e3bc6d7..20e7312 100644 --- a/celt/x86/pitch_sse.c +++ b/celt/x86/pitch_sse.c @@ -29,223 +29,157 @@ #include "config.h" #endif -#include <xmmintrin.h> -#include <emmintrin.h> - #include "macros.h" #include "celt_lpc.h" #include "stack_alloc.h" #include "mathops.h" #include "pitch.h" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#include <smmintrin.h> -#include "x86cpu.h" - -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, - int N) -{ - opus_int i, dataSize16; - opus_int32 sum; - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - __m128i inVec1_3210, inVec2_3210; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); - } +#if defined(OPUS_X86_MAY_HAVE_SSE) && 
!defined(FIXED_POINT) - acc1 = _mm_add_epi32(acc1, acc2); - - if (N - i >= 8) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - i += 8; - } - - if (N - i >= 4) - { - inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); - inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); - - inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); - - acc1 = _mm_add_epi32(acc1, inVec1_3210); - i += 4; - } - - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); - - sum += _mm_cvtsi128_si32(acc1); - - for (;i<N;i++) - { - sum = silk_SMLABB(sum, x[i], y[i]); - } +#include <xmmintrin.h> +#include "arch.h" - return sum; +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +{ + int j; + __m128 xsum1, xsum2; + xsum1 = _mm_loadu_ps(sum); + xsum2 = _mm_setzero_ps(); + + for (j = 0; j < len-3; j += 4) + { + __m128 x0 = _mm_loadu_ps(x+j); + __m128 yj = _mm_loadu_ps(y+j); + __m128 y3 = _mm_loadu_ps(y+j+3); + + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), + _mm_shuffle_ps(yj,y3,0x49))); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), + _mm_shuffle_ps(yj,y3,0x9e))); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); + } + if (j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + } + } + } + _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); } -void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) + +void 
dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) { - int j; - - __m128i vecX, vecX0, vecX1, vecX2, vecX3; - __m128i vecY0, vecY1, vecY2, vecY3; - __m128i sum0, sum1, sum2, sum3, vecSum; - __m128i initSum; - - celt_assert(len >= 3); - - sum0 = _mm_setzero_si128(); - sum1 = _mm_setzero_si128(); - sum2 = _mm_setzero_si128(); - sum3 = _mm_setzero_si128(); - - for (j=0;j<(len-7);j+=8) - { - vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); - vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); - vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); - vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); - vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); - - sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); - sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); - sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); - sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); - } - - sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); - sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); - - sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); - sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); - - sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); - sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); - - sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); - sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); - - vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), - _mm_unpacklo_epi32(sum2, sum3)); - - for (;j<(len-3);j+=4) - { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); - vecX1 = _mm_shuffle_epi32(vecX, 0x55); - vecX2 = _mm_shuffle_epi32(vecX, 0xaa); - vecX3 = _mm_shuffle_epi32(vecX, 0xff); - - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); - vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); - vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); - vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); - - 
sum0 = _mm_mullo_epi32(vecX0, vecY0); - sum1 = _mm_mullo_epi32(vecX1, vecY1); - sum2 = _mm_mullo_epi32(vecX2, vecY2); - sum3 = _mm_mullo_epi32(vecX3, vecY3); - - sum0 = _mm_add_epi32(sum0, sum1); - sum2 = _mm_add_epi32(sum2, sum3); - vecSum = _mm_add_epi32(vecSum, sum0); - vecSum = _mm_add_epi32(vecSum, sum2); - } - - for (;j<len;j++) - { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); - - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); - - sum0 = _mm_mullo_epi32(vecX0, vecY0); - vecSum = _mm_add_epi32(vecSum, sum0); - } - - initSum = _mm_loadu_si128((__m128i *)(&sum[0])); - initSum = _mm_add_epi32(initSum, vecSum); - _mm_storeu_si128((__m128i *)sum, initSum); + int i; + __m128 xsum1, xsum2; + xsum1 = _mm_setzero_ps(); + xsum2 = _mm_setzero_ps(); + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 y1i = _mm_loadu_ps(y01+i); + __m128 y2i = _mm_loadu_ps(y02+i); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); + } + /* Horizontal sum */ + xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); + xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); + _mm_store_ss(xy1, xsum1); + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); + _mm_store_ss(xy2, xsum2); + for (;i<N;i++) + { + *xy1 = MAC16_16(*xy1, x[i], y01[i]); + *xy2 = MAC16_16(*xy2, x[i], y02[i]); + } } -#endif -#if defined(OPUS_X86_MAY_HAVE_SSE2) -opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, int N) { - opus_int i, dataSize16; - opus_int32 sum; - - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - 
inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); - } - - acc1 = _mm_add_epi32( acc1, acc2 ); - - if (N - i >= 8) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + int i; + float xy; + __m128 sum; + sum = _mm_setzero_ps(); + /* FIXME: We should probably go 8-way and use 2 sums. */ + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 yi = _mm_loadu_ps(y+i); + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); + } + /* Horizontal sum */ + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); + _mm_store_ss(&xy, sum); + for (;i<N;i++) + { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - i += 8; - } +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + int i; + __m128 x0v; + __m128 g10v, g11v, g12v; + g10v = _mm_load1_ps(&g10); + g11v = _mm_load1_ps(&g11); + g12v = _mm_load1_ps(&g12); + x0v = _mm_loadu_ps(&x[-T-2]); + for (i=0;i<N-3;i+=4) + { + __m128 yi, yi2, x1v, x2v, x3v, x4v; + const opus_val32 *xp = &x[i-T-2]; + yi = _mm_loadu_ps(x+i); + x4v = _mm_loadu_ps(xp+4); +#if 0 + /* Slower version with all loads */ + x1v = _mm_loadu_ps(xp+1); + x2v = _mm_loadu_ps(xp+2); + x3v = _mm_loadu_ps(xp+3); +#else + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); +#endif - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( 
acc1, acc1)); - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); - sum += _mm_cvtsi128_si32(acc1); + yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#else + /* Use partial sums */ + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + yi = _mm_add_ps(yi, yi2); +#endif + x0v=x4v; + _mm_storeu_ps(y+i, yi); + } +#ifdef CUSTOM_MODES + for (;i<N;i++) + { + y[i] = x[i] + + MULT16_32_Q15(g10,x[i-T]) + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); + } +#endif +} - for (;i<N;i++) { - sum = silk_SMLABB(sum, x[i], y[i]); - } - return sum; -} #endif diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 99d1919..cbe722c 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -37,17 +37,37 @@ #include "config.h" #endif -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) void xcorr_kernel_sse4_1( const opus_int16 *x, const opus_int16 *y, opus_val32 sum[4], int len); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) +void xcorr_kernel_sse( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); +#endif + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse(x, y, sum, len)) + +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) extern 
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( - const opus_int16 *x, - const opus_int16 *y, + const opus_val16 *x, + const opus_val16 *y, opus_val32 sum[4], int len); @@ -55,181 +75,115 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( #define xcorr_kernel(x, y, sum, len, arch) \ ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) opus_val32 celt_inner_prod_sse4_1( const opus_int16 *x, const opus_int16 *y, int N); #endif -#if defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) opus_val32 celt_inner_prod_sse2( const opus_int16 *x, const opus_int16 *y, int N); #endif +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse( + const opus_val16 *x, + const opus_val16 *y, + int N); +#endif + + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse4_1(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse2(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse(x, y, N)) + + +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( - const opus_int16 *x, - const opus_int16 *y, + const opus_val16 *x, + const opus_val16 *y, int N); #define OVERRIDE_CELT_INNER_PROD #define celt_inner_prod(x, y, N, arch) \ ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) -#else -#include <xmmintrin.h> -#include "arch.h"
+#endif -#define OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) -{ - int j; - __m128 xsum1, xsum2; - xsum1 = _mm_loadu_ps(sum); - xsum2 = _mm_setzero_ps(); - - for (j = 0; j < len-3; j += 4) - { - __m128 x0 = _mm_loadu_ps(x+j); - __m128 yj = _mm_loadu_ps(y+j); - __m128 y3 = _mm_loadu_ps(y+j+3); - - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), - _mm_shuffle_ps(yj,y3,0x49))); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), - _mm_shuffle_ps(yj,y3,0x9e))); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); - } - if (j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - } - } - } - _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); -} - -#define xcorr_kernel(_x, _y, _z, len, arch) \ - ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) #define OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) -{ - int i; - __m128 xsum1, xsum2; - xsum1 = _mm_setzero_ps(); - xsum2 = _mm_setzero_ps(); - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 y1i = _mm_loadu_ps(y01+i); - __m128 y2i = _mm_loadu_ps(y02+i); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); - } - /* Horizontal sum */ - xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); - xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); - _mm_store_ss(xy1, xsum1); - xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); - xsum2 = 
_mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); - _mm_store_ss(xy2, xsum2); - for (;i<N;i++) - { - *xy1 = MAC16_16(*xy1, x[i], y01[i]); - *xy2 = MAC16_16(*xy2, x[i], y02[i]); - } -} +#define OVERRIDE_COMB_FILTER_CONST -#define OVERRIDE_CELT_INNER_PROD -static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, - int N) -{ - int i; - float xy; - __m128 sum; - sum = _mm_setzero_ps(); - /* FIXME: We should probably go 8-way and use 2 sums. */ - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 yi = _mm_loadu_ps(y+i); - sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); - } - /* Horizontal sum */ - sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); - sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); - _mm_store_ss(&xy, sum); - for (;i<N;i++) - { - xy = MAC16_16(xy, x[i], y[i]); - } - return xy; -} - -# define celt_inner_prod(_x, _y, len, arch) \ - ((void)(arch),celt_inner_prod_sse(_x, _y, len)) +void dual_inner_prod_sse(const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +void comb_filter_const_sse(opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + + +#if defined(OPUS_X86_PRESUME_SSE) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) #define OVERRIDE_COMB_FILTER_CONST -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) -{ - int i; - __m128 x0v; - __m128 g10v, g11v, g12v; - g10v = _mm_load1_ps(&g10); - g11v = _mm_load1_ps(&g11); - g12v = _mm_load1_ps(&g12); - x0v = _mm_loadu_ps(&x[-T-2]); - for (i=0;i<N-3;i+=4) - { - __m128 yi, yi2, x1v, x2v, x3v, x4v; - const opus_val32 *xp = &x[i-T-2]; - yi = _mm_loadu_ps(x+i); - x4v = _mm_loadu_ps(xp+4); -#if 0 - /* Slower version with all loads */ - x1v = _mm_loadu_ps(xp+1); - x2v = _mm_loadu_ps(xp+2); - 
x3v = _mm_loadu_ps(xp+3); -#else - x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); - x1v = _mm_shuffle_ps(x0v, x2v, 0x99); - x3v = _mm_shuffle_ps(x2v, x4v, 0x99); -#endif - yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ - yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); - yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); #else - /* Use partial sums */ - yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), - _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); - yi = _mm_add_ps(yi, yi2); + +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) + +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12)) + +#define NON_STATIC_COMB_FILTER_CONST_C + #endif - x0v=x4v; - _mm_storeu_ps(y+i, yi); - } -#ifdef CUSTOM_MODES - for (;i<N;i++) - { - y[i] = x[i] - + MULT16_32_Q15(g10,x[i-T]) - + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) - + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); - } #endif -} #endif -#endif diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c new file mode 100644 index 0000000..a0e7d1b --- /dev/null +++ b/celt/x86/pitch_sse2.c @@ -0,0 +1,95 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of 
conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32( acc1, acc2 ); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} +#endif diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c new file mode 100644 index 0000000..a092c68 --- /dev/null +++ b/celt/x86/pitch_sse4_1.c @@ -0,0 +1,195 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and 
use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#include <smmintrin.h> +#include "x86cpu.h" + +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + __m128i inVec1_3210, inVec2_3210; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32(acc1, acc2); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + if (N - i >= 4) + { + inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); + inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); + + inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); + + acc1 = _mm_add_epi32(acc1, inVec1_3210); + i += 4; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); + + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) + { + sum = silk_SMLABB(sum, 
x[i], y[i]); + } + + return sum; +} + +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) +{ + int j; + + __m128i vecX, vecX0, vecX1, vecX2, vecX3; + __m128i vecY0, vecY1, vecY2, vecY3; + __m128i sum0, sum1, sum2, sum3, vecSum; + __m128i initSum; + + celt_assert(len >= 3); + + sum0 = _mm_setzero_si128(); + sum1 = _mm_setzero_si128(); + sum2 = _mm_setzero_si128(); + sum3 = _mm_setzero_si128(); + + for (j=0;j<(len-7);j+=8) + { + vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); + vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); + vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); + vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); + vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); + + sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); + sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); + sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); + sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); + } + + sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); + sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); + + sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); + sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); + + sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); + sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); + + sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); + sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); + + vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), + _mm_unpacklo_epi32(sum2, sum3)); + + for (;j<(len-3);j+=4) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX1 = _mm_shuffle_epi32(vecX, 0x55); + vecX2 = _mm_shuffle_epi32(vecX, 0xaa); + vecX3 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); + vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 
3]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + sum3 = _mm_mullo_epi32(vecX3, vecY3); + + sum0 = _mm_add_epi32(sum0, sum1); + sum2 = _mm_add_epi32(sum2, sum3); + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum2); + } + + for (;j<len;j++) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); + } + + initSum = _mm_loadu_si128((__m128i *)(&sum[0])); + initSum = _mm_add_epi32(initSum, vecSum); + _mm_storeu_si128((__m128i *)sum, initSum); +} +#endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 83410db..1ed2acb 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -38,6 +38,8 @@ # if defined(FIXED_POINT) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1) + void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *num, @@ -49,8 +51,8 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( ) = { celt_fir_c, /* non-sse */ celt_fir_c, + celt_fir_c, MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ - NULL }; void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( @@ -61,24 +63,86 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( ) = { xcorr_kernel_c, /* non-sse */ xcorr_kernel_c, + xcorr_kernel_c, MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ - NULL }; +#endif + +#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (!defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) + opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ + celt_inner_prod_c, MAY_HAVE_SSE2(celt_inner_prod), MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ - NULL }; +#endif + # else -# error
"Floating-point implementation is not supported by x86 RTCD yet." \ - "Reconfigure with --disable-rtcd or send patches." -# endif +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* non-sse */ + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), +}; + +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N +) = { + celt_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), +}; + +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2 +) = { + dual_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), +}; + +void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12 +) = { + comb_filter_const_c, /* non-sse */ + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), +}; + + +#endif + +#endif #endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c index c82a4b7..afcdeb6 100644 --- a/celt/x86/x86cpu.c +++ b/celt/x86/x86cpu.c @@ -35,10 +35,19 @@ #include "pitch.h" #include "x86cpu.h" +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) + + #if defined(_MSC_VER) #include <intrin.h> -#define cpuid(info,x) __cpuid(info,x) +static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) +{ + 
__cpuid((int*)CPUInfo, InfoType); +} + #else #if defined(CPU_INFO_BY_C) @@ -48,14 +57,28 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) { #if defined(CPU_INFO_BY_ASM) +#if defined(__i386__) && defined(__PIC__) +/* %ebx is PIC register in 32-bit, so mustn't clobber it. */ + __asm__ __volatile__ ( + "xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n": + "=a" (CPUInfo[0]), + "=r" (CPUInfo[1]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "0" (InfoType) + ); +#else __asm__ __volatile__ ( "cpuid": "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "a" (InfoType), "c" (0) + "0" (InfoType) ); +#endif #elif defined(CPU_INFO_BY_C) __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); #endif @@ -63,11 +86,9 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) #endif -#include "SigProc_FIX.h" -#include "celt_lpc.h" - typedef struct CPU_Feature{ /* SIMD: 128-bit */ + int HW_SSE; int HW_SSE2; int HW_SSE41; } CPU_Feature; @@ -82,19 +103,31 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) if (nIds >= 1){ cpuid(info, 1); + cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0; cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; } + else { + cpu_feature->HW_SSE = 0; + cpu_feature->HW_SSE2 = 0; + cpu_feature->HW_SSE41 = 0; + } } int opus_select_arch(void) { - CPU_Feature cpu_feature = {0}; + CPU_Feature cpu_feature; int arch; opus_cpu_feature_check(&cpu_feature); arch = 0; + if (!cpu_feature.HW_SSE) + { + return arch; + } + arch++; + if (!cpu_feature.HW_SSE2) { return arch; @@ -109,3 +142,5 @@ int opus_select_arch(void) return arch; } + +#endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index ef53f0c..7f4c61d 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -28,6 +28,12 @@ #if !defined(X86CPU_H) # define X86CPU_H +# if defined(OPUS_X86_MAY_HAVE_SSE) +# define MAY_HAVE_SSE(name) name ## _sse +# else +# define 
MAY_HAVE_SSE(name) name ## _c +# endif + # if defined(OPUS_X86_MAY_HAVE_SSE2) # define MAY_HAVE_SSE2(name) name ## _sse2 # else @@ -55,21 +61,25 @@ int opus_select_arch(void); reference in the PMOVSXWD instruction itself, but gcc is not smart enough to optimize this out when optimizations ARE enabled. - It appears clang requires us to do this always (which is fair, since - technically the compiler is always allowed to do the dereference before - invoking the function implementing the intrinsic). I have not investiaged - whether it is any smarter than gcc when it comes to eliminating the extra - load instruction.*/ + Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 + (which is fair, since technically the compiler is always allowed to do the + dereference before invoking the function implementing the intrinsic). + However, it is smart enough to eliminate the extra MOVD instruction. + For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out + the extra MOVQ if it's specified explicitly */ + # if defined(__clang__) || !defined(__OPTIMIZE__) # define OP_CVTEPI8_EPI32_M32(x) \ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) - -# define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) # else # define OP_CVTEPI8_EPI32_M32(x) \ (_mm_cvtepi8_epi32(*(__m128i *)(x))) +#endif +# if !defined(__OPTIMIZE__) +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) +# else # define OP_CVTEPI16_EPI32_M64(x) \ (_mm_cvtepi16_epi32(*(__m128i *)(x))) # endif diff --git a/celt_sources.mk b/celt_sources.mk index 7121301..2ffe99a 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -21,7 +21,10 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \ celt/x86/x86_celt_map.c \ celt/x86/pitch_sse.c -CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c + +CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \ +celt/x86/pitch_sse4_1.c CELT_SOURCES_ARM = \ 
celt/arm/armcpu.c \ diff --git a/configure.ac b/configure.ac index baa3425..9b05fc1 100644 --- a/configure.ac +++ b/configure.ac @@ -348,8 +348,24 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], [test x"${asm_optimization%% *}" = x"ARM"]) -AM_CONDITIONAL([HAVE_SSE4_1], [false]) +AM_CONDITIONAL([HAVE_SSE], [false]) AM_CONDITIONAL([HAVE_SSE2], [false]) +AM_CONDITIONAL([HAVE_SSE4_1], [false]) + +m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse]) +m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2]) +m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1]) +m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon]) + +AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@]) +AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@]) +AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@]) +AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS[@:>@]) + +AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], DEFAULT_X86_SSE_CFLAGS)]) +AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], DEFAULT_X86_SSE2_CFLAGS)]) +AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], DEFAULT_X86_SSE4_1_CFLAGS)]) +AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], DEFAULT_ARM_NEON_INTR_CFLAGS)]) AC_DEFUN([OPUS_PATH_NE10], [ @@ -426,45 +442,53 @@ AC_DEFUN([OPUS_PATH_NE10], ) AS_IF([test x"$enable_intrinsics" = x"yes"],[ - case $host_cpu in - arm*) + intrinsics_support="" + AS_CASE([$host_cpu], + [arm*], + [ cpu_arm=yes - AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics) - save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" - AC_LINK_IFELSE( - [ - AC_LANG_PROGRAM( - [[#include <arm_neon.h> - ]], - [[ - static float32x4_t A[2], SUMM; - SUMM = 
vmlaq_f32(SUMM, A[0], A[1]); - ]] - ) - ],[ - OPUS_ARM_NEON_INTR=1 - AC_MSG_RESULT([yes]) - ],[ - OPUS_ARM_NEON_INTR=0 - AC_MSG_RESULT([no]) - ] + OPUS_CHECK_INTRINSICS( + [ARM Neon], + [$ARM_NEON_INTR_CFLAGS], + [OPUS_ARM_MAY_HAVE_NEON_INTR], + [OPUS_ARM_PRESUME_NEON_INTR], + [[#include <arm_neon.h> + ]], + [[ + static float32x4_t A0, A1, SUMM; + SUMM = vmlaq_f32(SUMM, A0, A1); + ]] + ) + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], + [ + OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS" + AC_SUBST([OPUS_ARM_NEON_INTR_CFLAGS]) + ] ) - CFLAGS="$save_CFLAGS" - #Now we know if compiler supports ARM neon intrinsics or not - #Currently we only have intrinsic optimization for floating point + #Currently we only have intrinsic optimizations for floating point AS_IF([test x"$enable_float" = x"yes"], [ - AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"], [ - AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics]) - AS_IF([test x"enable_rtcd" != x""], - [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[]) - enable_intrinsics="$enable_intrinsics ARMv7_Neon_Intrinsics" - dnl Don't see why defining these is necessary to check features at runtime - AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions]) - AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions]) - AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions]) + AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, + [Compiler supports ARMv7 Neon Intrinsics]) + intrinsics_support="$intrinsics_support (Neon_Intrinsics)" + + AS_IF([test x"$enable_rtcd" != x"" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], + [rtcd_support="$rtcd_support (ARMv7_Neon_Intrinsics)"],[]) + + AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"], + [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1, + [Define if binary requires NEON intrinsics
support])]) + + AS_IF([test x"$rtcd_support" = x""], + [rtcd_support=no]) + + AS_IF([test x"$intrinsics_support" = x""], + [intrinsics_support=no], + [intrinsics_support="arm$intrinsics_support"]) + OPUS_PATH_NE10() AS_IF([test x"$NE10_LIBS" != "x"], @@ -472,18 +496,122 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ ], [ AC_MSG_WARN([Compiler does not support ARM intrinsics]) - enable_intrinsics=no + intrinsics_support=no ]) ], [ - AC_MSG_WARN([Currently on have ARM intrinsics for float]) - enable_intrinsics=no + AC_MSG_WARN([Currently only have ARM intrinsics for float]) + intrinsics_support=no ]) - ;; - "i386" | "i686" | "x86_64") - AS_IF([test x"$enable_float" = x"no"],[ - AS_IF([test x"$enable_rtcd" = x"yes"],[ + ], + [i?86|x86_64], + [ + OPUS_CHECK_INTRINSICS( + [SSE], + [$X86_SSE_CFLAGS], + [OPUS_X86_MAY_HAVE_SSE], + [OPUS_X86_PRESUME_SSE], + [[#include <xmmintrin.h> + ]], + [[ + static __m128 mtest; + mtest = _mm_setzero_ps(); + ]] + ) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1" && test x"$OPUS_X86_PRESUME_SSE" != x"1"], + [ + OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS" + AC_SUBST([OPUS_X86_SSE_CFLAGS]) + ] + ) + OPUS_CHECK_INTRINSICS( + [SSE2], + [$X86_SSE2_CFLAGS], + [OPUS_X86_MAY_HAVE_SSE2], + [OPUS_X86_PRESUME_SSE2], + [[#include <emmintrin.h> + ]], + [[ + static __m128i mtest; + mtest = _mm_setzero_si128(); + ]] + ) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"], + [ + OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS" + AC_SUBST([OPUS_X86_SSE2_CFLAGS]) + ] + ) + OPUS_CHECK_INTRINSICS( + [SSE4.1], + [$X86_SSE4_1_CFLAGS], + [OPUS_X86_MAY_HAVE_SSE4_1], + [OPUS_X86_PRESUME_SSE4_1], + [[#include <smmintrin.h> + ]], + [[ + static __m128i mtest; + mtest = _mm_setzero_si128(); + mtest = _mm_cmpeq_epi64(mtest, mtest); + ]] + ) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1" && test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"], + [ + OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS" + AC_SUBST([OPUS_X86_SSE4_1_CFLAGS]) + ] + ) + + AS_IF([test 
x"$rtcd_support" = x"no"], [rtcd_support=""]) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"], + [ + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86 SSE Intrinsics]) + intrinsics_support="$intrinsics_support SSE" + + AS_IF([test x"$OPUS_X86_PRESUME_SSE" = x"1"], + [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary requires SSE intrinsics support])], + [rtcd_support="$rtcd_support SSE"]) + ], + [ + AC_MSG_WARN([Compiler does not support SSE intrinsics]) + ]) + + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"], + [ + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86 SSE2 Intrinsics]) + intrinsics_support="$intrinsics_support SSE2" + + AS_IF([test x"$OPUS_X86_PRESUME_SSE2" = x"1"], + [AC_DEFINE([OPUS_X86_PRESUME_SSE2], 1, [Define if binary requires SSE2 intrinsics support])], + [rtcd_support="$rtcd_support SSE2"]) + ], + [ + AC_MSG_WARN([Compiler does not support SSE2 intrinsics]) + ]) + + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"], + [ + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], 1, [Compiler supports X86 SSE4.1 Intrinsics]) + intrinsics_support="$intrinsics_support SSE4.1" + + AS_IF([test x"$OPUS_X86_PRESUME_SSE4_1" = x"1"], + [AC_DEFINE([OPUS_X86_PRESUME_SSE4_1], 1, [Define if binary requires SSE4.1 intrinsics support])], + [rtcd_support="$rtcd_support SSE4.1"]) + ], + [ + AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics]) + ]) + AS_IF([test x"$intrinsics_support" = x""], + [intrinsics_support=no], + [intrinsics_support="x86$intrinsics_support"] + ) + AS_IF([test x"$rtcd_support" = x""], + [rtcd_support=no], + [rtcd_support="x86$rtcd_support"], + ) + + AS_IF([test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x""],[ get_cpuid_by_asm="no" - AC_MSG_CHECKING([Get CPU Info]) + AC_MSG_CHECKING([How to get X86 CPU Info]) AC_LINK_IFELSE([AC_LANG_PROGRAM([[ #include <stdio.h> ]],[[ @@ -493,7 +621,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ unsigned int CPUInfo3; unsigned int InfoType; __asm__ __volatile__ ( - "cpuid11": + 
"cpuid": "=a" (CPUInfo0), "=b" (CPUInfo1), "=c" (CPUInfo2), @@ -502,7 +630,8 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ ); ]])], [get_cpuid_by_asm="yes" - AC_MSG_RESULT([Inline Assembly])], + AC_MSG_RESULT([Inline Assembly]) + AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], [AC_LINK_IFELSE([AC_LANG_PROGRAM([[ #include <cpuid.h> ]],[[ @@ -513,90 +642,31 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ unsigned int InfoType; __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3); ]])], - [AC_MSG_RESULT([C method])], - [AC_MSG_ERROR([not support Get CPU Info, please disable intrinsics ])])]) - - AC_MSG_CHECKING([sse4.1]) - TMP_CFLAGS="$CFLAGS" - gcc -Q --help=target | grep "\-msse4.1 " - AS_IF([test x"$?" = x"0"],[ - CFLAGS="$CFLAGS -msse4.1" - AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])]) - AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])]) - AC_CHECK_HEADER(smmintrin.h, [], [AC_MSG_ERROR([Couldn't find smmintrin.h])],[ - #ifdef HAVE_XMMINSTRIN_H - #include <xmmintrin.h> - #endif - #ifdef HAVE_EMMINSTRIN_H - #include <emmintrin.h> - #endif - ]) - - AC_LINK_IFELSE([AC_LANG_PROGRAM([[ - #include <xmmintrin.h> - #include <emmintrin.h> - #include <smmintrin.h> - ]],[[ - __m128i mtest = _mm_setzero_si128(); - mtest = _mm_cmpeq_epi64(mtest, mtest); - ]])], - [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse4.1, please disable intrinsics])]) - - CFLAGS="$TMP_CFLAGS" - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], [1], [For x86 sse4.1 instrinsics optimizations]) - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimizations]) - rtcd_support="x86 sse4.1" - AM_CONDITIONAL([HAVE_SSE4_1], [true]) - AM_CONDITIONAL([HAVE_SSE2], [true]) - AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], - [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])]) - ],[ ##### Else case for AS_IF([test x"$?" 
= x"0"]) - gcc -Q --help=target | grep "\-msse2 " - AC_MSG_CHECKING([sse2]) - AS_IF([test x"$?" = x"0"],[ - AC_MSG_RESULT([yes]) - CFLAGS="$CFLAGS -msse2" - AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])]) - AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])]) - - AC_LINK_IFELSE([AC_LANG_PROGRAM([[ - #include <xmmintrin.h> - #include <emmintrin.h> - ]],[[ - __m128i mtest = _mm_setzero_si128(); - ]])], - [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse2, please disable intrinsics])]) - - CFLAGS="$TMP_CFLAGS" - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimize]) - rtcd_support="x86 sse2" - AM_CONDITIONAL([HAVE_SSE2], [true]) - AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], - [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])]) - ],[enable_intrinsics="no"]) #End of AS_IF([test x"$?" = x"0"] - ]) - ], [ - enable_intrinsics="no" - ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"] -], -[ ## Else case for AS_IF([test x"$enable_float" = x"no"] - AC_MSG_WARN([Disabling intrinsics .. 
x86 intrinsics only avail for fixed point]) - enable_intrinsics="no" -]) ## End of AS_IF([test x"$enable_float" = x"no"] - ;; - *) + [AC_MSG_RESULT([C method]) + AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])], + [AC_MSG_ERROR([no supported Get CPU Info method, please disable intrinsics])])])]) + ], + [ AC_MSG_WARN([No intrinsics support for your architecture]) - enable_intrinsics="no" - ;; - esac + intrinsics_support="no" + ]) +], +[ + intrinsics_support="no" ]) AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) AM_CONDITIONAL([OPUS_ARM_NEON_INTR], - [test x"$OPUS_ARM_NEON_INTR" = x"1"]) + [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"]) AM_CONDITIONAL([HAVE_ARM_NE10], [test x"$HAVE_ARM_NE10" = x"1"]) +AM_CONDITIONAL([HAVE_SSE], + [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"]) +AM_CONDITIONAL([HAVE_SSE2], + [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"]) +AM_CONDITIONAL([HAVE_SSE4_1], + [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"]) AS_IF([test x"$enable_rtcd" = x"yes"],[ AS_IF([test x"$rtcd_support" != x"no"],[ @@ -704,7 +774,7 @@ AC_MSG_NOTICE([ Fixed point debugging: ......... ${enable_fixed_point_debug} Inline Assembly Optimizations: . ${inline_optimization} External Assembly Optimizations: ${asm_optimization} - Intrinsics Optimizations.......: ${enable_intrinsics} + Intrinsics Optimizations.......: ${intrinsics_support} Run-time CPU detection: ........ ${rtcd_support} Custom modes: .................. ${enable_custom_modes} Assertion checking: ............ 
${enable_assertions} diff --git a/m4/opus-intrinsics.m4 b/m4/opus-intrinsics.m4 new file mode 100644 index 0000000..c74aecd --- /dev/null +++ b/m4/opus-intrinsics.m4 @@ -0,0 +1,29 @@ +dnl opus-intrinsics.m4 +dnl macro for testing for support for compiler intrinsics, either by default or with a compiler flag + +dnl OPUS_CHECK_INTRINSICS(NAME-OF-INTRINSICS, COMPILER-FLAG-FOR-INTRINSICS, VAR-IF-PRESENT, VAR-IF-DEFAULT, TEST-PROGRAM-HEADER, TEST-PROGRAM-BODY) +AC_DEFUN([OPUS_CHECK_INTRINSICS], +[ + AC_MSG_CHECKING([if compiler supports $1 intrinsics]) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM($5, $6)], + [ + $3=1 + $4=1 + AC_MSG_RESULT([yes]) + ],[ + $4=0 + AC_MSG_RESULT([no]) + AC_MSG_CHECKING([if compiler supports $1 intrinsics with $2]) + save_CFLAGS="$CFLAGS"; CFLAGS="$2 $CFLAGS" + AC_LINK_IFELSE([AC_LANG_PROGRAM($5, $6)], + [ + AC_MSG_RESULT([yes]) + $3=1 + ],[ + AC_MSG_RESULT([no]) + $3=0 + ]) + CFLAGS="$save_CFLAGS" + ]) +]) diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h index 9a0e096..61efa8d 100644 --- a/silk/x86/SigProc_FIX_sse.h +++ b/silk/x86/SigProc_FIX_sse.h @@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) + +#else + extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ @@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( # define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +#endif + opus_int64 
silk_inner_prod16_aligned_64_sse4_1( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ); + +#if defined(OPUS_X86_PRESUME_SSE4_1) + +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) + +#else + extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -75,3 +91,4 @@ extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( #endif #endif +#endif diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h index f970632..afd5ec2 100644 --- a/silk/x86/main_sse.h +++ b/silk/x86/main_sse.h @@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1( opus_int L /* I number of vectors in codebook */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L)) + +#else + extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( opus_int8 *ind, /* O index of best codebook vector */ opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ @@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ mu_Q9, max_gain_Q7, L)) +#endif + # define OVERRIDE_silk_NSQ void silk_NSQ_sse4_1( @@ -89,6 +100,15 @@ void silk_NSQ_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, 
LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + # define OVERRIDE_silk_NSQ_del_dec void silk_NSQ_del_dec_sse4_1( @@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + void silk_noise_shape_quantizer( silk_nsq_state *NSQ, /* I/O NSQ state */ opus_int signalType, /* I Signal type */ @@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( const opus_int16 pIn[] ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) 
+ +#else + # define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) @@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( # define OVERRIDE_silk_warped_LPC_analysis_filter_FIX +#endif + void silk_warped_LPC_analysis_filter_FIX_sse4_1( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1( const opus_int order /* I Filter order (even) */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ + ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) + +#else + extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -224,5 +270,7 @@ extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1]) # define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) +#endif + # endif #endif diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c index 6747d10..ad9fef2 100644 --- a/silk/x86/x86_silk_map.c +++ b/silk/x86/x86_silk_map.c @@ -35,6 +35,10 @@ #include "pitch.h" #include "main.h" +#if !defined(OPUS_X86_PRESUME_SSE4_1) + +#if defined(FIXED_POINT) + opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_inner_prod16_aligned_64_c, /* non-sse */ silk_inner_prod16_aligned_64_c, + silk_inner_prod16_aligned_64_c, 
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ - NULL }; +#endif + opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( silk_encoder_state *psEncC, const opus_int16 pIn[] ) = { silk_VAD_GetSA_Q8_c, /* non-sse */ silk_VAD_GetSA_Q8_c, + silk_VAD_GetSA_Q8_c, MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_c, /* non-sse */ silk_NSQ_c, + silk_NSQ_c, MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ - NULL }; void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_VQ_WMat_EC_c, /* non-sse */ silk_VQ_WMat_EC_c, + silk_VQ_WMat_EC_c, MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_del_dec_c, /* non-sse */ silk_NSQ_del_dec_c, + silk_NSQ_del_dec_c, MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ - NULL }; +#if defined(FIXED_POINT) + void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */ silk_warped_LPC_analysis_filter_FIX_c, + silk_warped_LPC_analysis_filter_FIX_c, MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */ - NULL }; void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_burg_modified_c, /* non-sse */ silk_burg_modified_c, + silk_burg_modified_c, MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */ - NULL }; + +#endif +#endif diff --git 
a/win32/VS2010/celt.vcxproj b/win32/VS2010/celt.vcxproj index f107fec..e068fbe 100644 --- a/win32/VS2010/celt.vcxproj +++ b/win32/VS2010/celt.vcxproj @@ -37,6 +37,12 @@ <ClCompile Include="..\..\celt\quant_bands.c" /> <ClCompile Include="..\..\celt\rate.c" /> <ClCompile Include="..\..\celt\vq.c" /> + <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c" /> + <ClCompile Include="..\..\celt\x86\pitch_sse.c" /> + <ClCompile Include="..\..\celt\x86\pitch_sse2.c" /> + <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c" /> + <ClCompile Include="..\..\celt\x86\x86cpu.c" /> + <ClCompile Include="..\..\celt\x86\x86_celt_map.c" /> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\celt\arch.h" /> @@ -67,6 +73,9 @@ <ClInclude Include="..\..\celt\static_modes_fixed.h" /> <ClInclude Include="..\..\celt\static_modes_float.h" /> <ClInclude Include="..\..\celt\vq.h" /> + <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h" /> + <ClInclude Include="..\..\celt\x86\pitch_sse.h" /> + <ClInclude Include="..\..\celt\x86\x86cpu.h" /> <ClInclude Include="..\..\celt\_kiss_fft_guts.h" /> </ItemGroup> <PropertyGroup Label="Globals"> @@ -141,7 +150,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -168,7 +177,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - 
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -196,7 +205,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> </ClCompile> <Link> @@ -227,7 +236,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> </ClCompile> <Link> diff --git a/win32/VS2010/celt.vcxproj.filters b/win32/VS2010/celt.vcxproj.filters index e3a1d97..e9948fa 100644 --- a/win32/VS2010/celt.vcxproj.filters +++ b/win32/VS2010/celt.vcxproj.filters @@ -69,6 +69,24 @@ <ClCompile 
Include="..\..\celt\celt.c"> <Filter>Source Files</Filter> </ClCompile> + <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\celt\x86\pitch_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\celt\x86\pitch_sse2.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\celt\x86\x86_celt_map.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\celt\x86\x86cpu.c"> + <Filter>Source Files</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\celt\cwrs.h"> @@ -158,5 +176,14 @@ <ClInclude Include="..\..\celt\celt_lpc.h"> <Filter>Header Files</Filter> </ClInclude> + <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\celt\x86\pitch_sse.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\celt\x86\x86cpu.h"> + <Filter>Header Files</Filter> + </ClInclude> </ItemGroup> </Project> \ No newline at end of file diff --git a/win32/VS2010/silk_common.vcxproj b/win32/VS2010/silk_common.vcxproj index 9cf5f48..d3d077d 100644 --- a/win32/VS2010/silk_common.vcxproj +++ b/win32/VS2010/silk_common.vcxproj @@ -88,7 +88,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> 
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -118,7 +118,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -149,7 +149,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> <FloatingPointModel>Fast</FloatingPointModel> </ClCompile> @@ -184,7 +184,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> + 
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> <FloatingPointModel>Fast</FloatingPointModel> </ClCompile> @@ -212,6 +212,8 @@ </ItemDefinitionGroup> <ItemGroup> <ClInclude Include="..\..\include\opus_types.h" /> + <ClInclude Include="..\..\silk\x86\main_sse.h" /> + <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h" /> <ClInclude Include="..\..\win32\config.h" /> <ClInclude Include="..\..\silk\control.h" /> <ClInclude Include="..\..\silk\debug.h" /> @@ -311,8 +313,13 @@ <ClCompile Include="..\..\silk\table_LSF_cos.c" /> <ClCompile Include="..\..\silk\VAD.c" /> <ClCompile Include="..\..\silk\VQ_WMat_EC.c" /> + <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c" /> + <ClCompile Include="..\..\silk\x86\NSQ_sse.c" /> + <ClCompile Include="..\..\silk\x86\VAD_sse.c" /> + <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c" /> + <ClCompile Include="..\..\silk\x86\x86_silk_map.c" /> </ItemGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <ImportGroup Label="ExtensionTargets"> </ImportGroup> -</Project> +</Project> \ No newline at end of file diff --git a/win32/VS2010/silk_common.vcxproj.filters b/win32/VS2010/silk_common.vcxproj.filters index 30db48e..341180b 100644 --- a/win32/VS2010/silk_common.vcxproj.filters +++ b/win32/VS2010/silk_common.vcxproj.filters @@ -81,6 +81,12 @@ <ClInclude Include="..\..\silk\typedef.h"> <Filter>Header Files</Filter> </ClInclude> + <ClInclude Include="..\..\silk\x86\main_sse.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h"> + <Filter>Header Files</Filter> + </ClInclude> </ItemGroup> <ItemGroup> <ClCompile Include="..\..\silk\VQ_WMat_EC.c"> @@ -311,5 +317,20 @@ <ClCompile Include="..\..\silk\VAD.c"> <Filter>Source 
Files</Filter> </ClCompile> + <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\x86\NSQ_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\x86\VAD_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\x86\x86_silk_map.c"> + <Filter>Source Files</Filter> + </ClCompile> </ItemGroup> -</Project> +</Project> \ No newline at end of file diff --git a/win32/VS2010/silk_fixed.vcxproj b/win32/VS2010/silk_fixed.vcxproj index 5ea1a91..522101e 100644 --- a/win32/VS2010/silk_fixed.vcxproj +++ b/win32/VS2010/silk_fixed.vcxproj @@ -86,7 +86,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -104,7 +104,7 @@ <WarningLevel>Level3</WarningLevel> <Optimization>Disabled</Optimization> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> + 
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> </ClCompile> <Link> @@ -123,7 +123,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> </ClCompile> <Link> @@ -145,7 +145,7 @@ <FunctionLevelLinking>true</FunctionLevelLinking> <IntrinsicFunctions>true</IntrinsicFunctions> <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> <RuntimeLibrary>MultiThreaded</RuntimeLibrary> </ClCompile> <Link> @@ -191,8 +191,11 @@ <ClCompile Include="..\..\silk\fixed\solve_LS_FIX.c" /> <ClCompile Include="..\..\silk\fixed\vector_ops_FIX.c" /> <ClCompile Include="..\..\silk\fixed\warped_autocorrelation_FIX.c" /> + <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c" /> + <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c" /> + 
<ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c" /> </ItemGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <ImportGroup Label="ExtensionTargets"> </ImportGroup> -</Project> +</Project> \ No newline at end of file diff --git a/win32/VS2010/silk_fixed.vcxproj.filters b/win32/VS2010/silk_fixed.vcxproj.filters index 6897930..c2327eb 100644 --- a/win32/VS2010/silk_fixed.vcxproj.filters +++ b/win32/VS2010/silk_fixed.vcxproj.filters @@ -18,16 +18,16 @@ <ClInclude Include="..\..\win32\config.h"> <Filter>Header Files</Filter> </ClInclude> - <ClInclude Include="main_FIX.h"> + <ClInclude Include="..\..\include\opus_types.h"> <Filter>Header Files</Filter> </ClInclude> - <ClInclude Include="..\SigProc_FIX.h"> + <ClInclude Include="..\..\silk\SigProc_FIX.h"> <Filter>Header Files</Filter> </ClInclude> - <ClInclude Include="structs_FIX.h"> + <ClInclude Include="..\..\silk\fixed\main_FIX.h"> <Filter>Header Files</Filter> </ClInclude> - <ClInclude Include="..\..\include\opus_types.h"> + <ClInclude Include="..\..\silk\fixed\structs_FIX.h"> <Filter>Header Files</Filter> </ClInclude> </ItemGroup> @@ -107,5 +107,14 @@ <ClCompile Include="..\..\silk\fixed\LTP_analysis_filter_FIX.c"> <Filter>Source Files</Filter> </ClCompile> + <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c"> + <Filter>Source Files</Filter> + </ClCompile> </ItemGroup> </Project> \ No newline at end of file diff --git a/win32/config.h b/win32/config.h index 46ff699..10fbf33 100644 --- a/win32/config.h +++ b/win32/config.h @@ -35,9 +35,28 @@ POSSIBILITY OF SUCH DAMAGE. 
#define OPUS_BUILD 1 -/* Enable SSE functions, if compiled with SSE/SSE2 (note that AMD64 implies SSE2) */ -#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) -#define __SSE__ 1 +#if defined(_M_IX86) || defined(_M_X64) +/* Can always build with SSE intrinsics (no special compiler flags necessary) */ +#define OPUS_X86_MAY_HAVE_SSE +#define OPUS_X86_MAY_HAVE_SSE2 +#define OPUS_X86_MAY_HAVE_SSE4_1 + +/* Presume SSE functions, if compiled with SSE/SSE2/AVX (note that AMD64 implies SSE2, and AVX + implies SSE4.1) */ +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) || defined(__AVX__) +#define OPUS_X86_PRESUME_SSE 1 +#endif +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__AVX__) +#define OPUS_X86_PRESUME_SSE2 1 +#endif +#if defined(__AVX__) +#define OPUS_X86_PRESUME_SSE4_1 1 +#endif + +#if !defined(OPUS_X86_PRESUME_SSE4_1) || !defined(OPUS_X86_PRESUME_SSE2) || !defined(OPUS_X86_PRESUME_SSE) +#define OPUS_HAVE_RTCD 1 +#endif + #endif #include "version.h" -- 1.9.1
Viswanath Puttagunta
2015-Mar-12 18:06 UTC
[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.
Hi All, I have rebased Jonathan's patch [1] (which I will refer to as RFCv1) on top of my ARM NEON intrinsics/NE10 work [2], [3]. I have reviewed the patch, rebased it, and made sure it compiles. Configure options checked: ../opus/configure ../opus/configure --enable-intrinsics ../opus/configure --host=arm-linux-gnueabihf ../opus/configure --host=arm-linux-gnueabihf --enable-intrinsics ../opus/configure --host=aarch64-gnu-linux ../opus/configure --host=aarch64-gnu-linux I verified it works on x86 (SSE4.1). (Encode/Decode celt basic tests) I will verify on ARM(64) hardware later on. (on road now) I will base my future work on this patch. This patch, in addition to the ARM NE10 patches posted in [2] and [3], is available at [4] for reference. Note that I don't have experience with x86 intrinsics/instructions. So, I reviewed it from the perspective of generic code fixes and any impact on ARM. Please provide feedback/review so this work can be merged and we don't have to keep rebasing our future work. [1]: http://lists.xiph.org/pipermail/opus/2015-March/002899.html [2]: http://lists.xiph.org/pipermail/opus/2015-March/002905.html [3]: http://lists.xiph.org/pipermail/opus/2015-March/002919.html [4]: https://git.linaro.org/people/viswanath.puttagunta/opus.git Branch: opus-dev-pending Regards, Vish On 12 March 2015 at 12:45, Viswanath Puttagunta <viswanath.puttagunta at linaro.org> wrote:> From: Jonathan Lennox <jonathan at vidyo.com> > > * Makes --enable-intrinsics work with clang and other non-GCC compilers > * Enables RTCD for the floating-point-mode SSE code in Celt. > * Disables use of RTCD in cases where the compiler targets an instruction set by default. > * Enables the SSE4.1 Silk optimizations that apply to the common parts of Silk when Opus is built in floating-point mode, not just in fixed-point mode. > * Enables the SSE intrinsics (with RTCD when appropriate) in the Win32 build. > * Fixes a case where GCC would compile SSE2 code as SSE4.1, causing a crash on non-SSE4.1 CPUs. 
> * Allows configuration with compilers with non-GCC-flavor flags for enabling architecture options. > * Hopefully makes the configuration and ifdefs easier to follow and understand. > > Reviewed-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org> > --- > Makefile.am | 38 ++-- > celt/arm/armcpu.c | 6 +- > celt/arm/pitch_arm.h | 4 +- > celt/bands.c | 6 +- > celt/celt.c | 16 +- > celt/celt.h | 12 +- > celt/celt_decoder.c | 6 +- > celt/celt_encoder.c | 4 +- > celt/celt_lpc.h | 2 +- > celt/cpu_support.h | 15 +- > celt/mips/celt_mipsr1.h | 2 +- > celt/pitch.c | 4 +- > celt/pitch.h | 19 +- > celt/tests/test_unit_dft.c | 4 +- > celt/tests/test_unit_mathops.c | 11 +- > celt/tests/test_unit_mdct.c | 4 +- > celt/tests/test_unit_rotation.c | 11 +- > celt/x86/celt_lpc_sse.c | 4 + > celt/x86/celt_lpc_sse.h | 12 +- > celt/x86/pitch_sse.c | 334 +++++++++++++------------------ > celt/x86/pitch_sse.h | 256 ++++++++++------------- > celt/x86/pitch_sse2.c | 95 +++++++++ > celt/x86/pitch_sse4_1.c | 195 ++++++++++++++++++ > celt/x86/x86_celt_map.c | 76 ++++++- > celt/x86/x86cpu.c | 47 ++++- > celt/x86/x86cpu.h | 26 ++- > celt_sources.mk | 5 +- > configure.ac | 312 ++++++++++++++++++----------- > m4/opus-intrinsics.m4 | 29 +++ > silk/x86/SigProc_FIX_sse.h | 17 ++ > silk/x86/main_sse.h | 48 +++++ > silk/x86/x86_silk_map.c | 25 ++- > win32/VS2010/celt.vcxproj | 17 +- > win32/VS2010/celt.vcxproj.filters | 27 +++ > win32/VS2010/silk_common.vcxproj | 17 +- > win32/VS2010/silk_common.vcxproj.filters | 23 ++- > win32/VS2010/silk_fixed.vcxproj | 13 +- > win32/VS2010/silk_fixed.vcxproj.filters | 17 +- > win32/config.h | 25 ++- > 39 files changed, 1210 insertions(+), 574 deletions(-) > create mode 100644 celt/x86/pitch_sse2.c > create mode 100644 celt/x86/pitch_sse4_1.c > create mode 100644 m4/opus-intrinsics.m4 > > diff --git a/Makefile.am b/Makefile.am > index c5c1562..3a75740 100644 > --- a/Makefile.am > +++ b/Makefile.am > @@ -23,6 +23,9 @@ SILK_SOURCES += $(SILK_SOURCES_SSE4_1) 
$(SILK_SOURCES_FIXED_SSE4_1) > endif > else > SILK_SOURCES += $(SILK_SOURCES_FLOAT) > +if HAVE_SSE4_1 > +SILK_SOURCES += $(SILK_SOURCES_SSE4_1) > +endif > endif > > if DISABLE_FLOAT_API > @@ -30,12 +33,14 @@ else > OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) > endif > > -if HAVE_SSE4_1 > -CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1) > -else > -if HAVE_SSE2 > +if HAVE_SSE > CELT_SOURCES += $(CELT_SOURCES_SSE) > endif > +if HAVE_SSE2 > +CELT_SOURCES += $(CELT_SOURCES_SSE2) > +endif > +if HAVE_SSE4_1 > +CELT_SOURCES += $(CELT_SOURCES_SSE4_1) > endif > > if CPU_ARM > @@ -44,7 +49,6 @@ SILK_SOURCES += $(SILK_SOURCES_ARM) > > if OPUS_ARM_NEON_INTR > CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) > -OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon > endif > > if HAVE_ARM_NE10 > @@ -262,20 +266,30 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl > %-gnu.S: %.s > $(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@ > > -SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o > +OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ > + $(celt_tests_test_unit_rotation_SOURCES:.c=.o) > + > +if HAVE_SSE > +SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo) > +$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS) > +endif > > -if HAVE_SSE4_1 > -$(SSE_OBJ): CFLAGS += -msse4.1 > -else > if HAVE_SSE2 > -$(SSE_OBJ): CFLAGS += -msse2 > +SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) > +$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) > endif > + > +if HAVE_SSE4_1 > +SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ > + $(SILK_SOURCES_SSE4_1:.c=.lo) \ > + $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) > +$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) > endif > > if OPUS_ARM_NEON_INTR > CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ > $(CELT_SOURCES_ARM_NE10:.c=.lo) \ > - %test_unit_rotation.o %test_unit_mathops.o \ > %test_unit_mdct.o %test_unit_dft.o > -$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += 
$(OPUS_ARM_NEON_INTR_CPPFLAGS) $(NE10_CFLAGS) > + > +$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS) > endif > diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c > index 1768525..5e5d10c 100644 > --- a/celt/arm/armcpu.c > +++ b/celt/arm/armcpu.c > @@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ > __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ > /*Ignore exception.*/ > } > -# if defined(OPUS_ARM_MAY_HAVE_NEON) > +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) > __try{ > /*VORR q0,q0,q0*/ > __emit(0xF2200150); > @@ -107,7 +107,7 @@ opus_uint32 opus_cpu_capabilities(void) > > while(fgets(buf, 512, cpuinfo) != NULL) > { > -# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) > +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) > /* Search for edsp and neon flag */ > if(memcmp(buf, "Features", 8) == 0) > { > @@ -118,7 +118,7 @@ opus_uint32 opus_cpu_capabilities(void) > flags |= OPUS_CPU_ARM_EDSP; > # endif > > -# if defined(OPUS_ARM_MAY_HAVE_NEON) > +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) > p = strstr(buf, " neon"); > if(p != NULL && (p[5] == ' ' || p[5] == '\n')) > flags |= OPUS_CPU_ARM_NEON; > diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h > index 125d1bc..8626ed7 100644 > --- a/celt/arm/pitch_arm.h > +++ b/celt/arm/pitch_arm.h > @@ -54,10 +54,10 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, > > #else /* Start !FIXED_POINT */ > /* Float case */ > -#if defined(OPUS_ARM_NEON_INTR) > +#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) > void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, > opus_val32 *xcorr, int len, int max_pitch); > -#if !defined(OPUS_HAVE_RTCD) > +#if !defined(OPUS_HAVE_RTCD) || defined(OPUS_ARM_PRESUME_NEON_INTR) > #define OVERRIDE_PITCH_XCORR (1) > 
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ > ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch)) > diff --git a/celt/bands.c b/celt/bands.c > index c643b09..25f229e 100644 > --- a/celt/bands.c > +++ b/celt/bands.c > @@ -398,7 +398,7 @@ static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT > } > } > > -static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N) > +static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N, int arch) > { > int j; > opus_val32 xp=0, side=0; > @@ -410,7 +410,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT > opus_val32 t, lgain, rgain; > > /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ > - dual_inner_prod(Y, X, Y, N, &xp, &side); > + dual_inner_prod(Y, X, Y, N, &xp, &side, arch); > /* Compensating for the mid normalization */ > xp = MULT16_32_Q15(mid, xp); > /* mid and side are in Q15, not Q14 like X and Y */ > @@ -1348,7 +1348,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm > if (resynth) > { > if (N!=2) > - stereo_merge(X, Y, mid, N); > + stereo_merge(X, Y, mid, N, ctx->arch); > if (inv) > { > int j; > diff --git a/celt/celt.c b/celt/celt.c > index a610de4..40c62ce 100644 > --- a/celt/celt.c > +++ b/celt/celt.c > @@ -89,10 +89,12 @@ int resampling_factor(opus_int32 rate) > return ret; > } > > -#ifndef OVERRIDE_COMB_FILTER_CONST > /* This version should be faster on ARM */ > #ifdef OPUS_ARM_ASM > -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, > +#ifndef NON_STATIC_COMB_FILTER_CONST_C > +static > +#endif > +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, > opus_val16 g10, opus_val16 g11, opus_val16 g12) > { > opus_val32 x0, x1, x2, x3, x4; > @@ -147,7 +149,10 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, > 
#endif > } > #else > -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, > +#ifndef NON_STATIC_COMB_FILTER_CONST_C > +static > +#endif > +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, > opus_val16 g10, opus_val16 g11, opus_val16 g12) > { > opus_val32 x0, x1, x2, x3, x4; > @@ -171,12 +176,11 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, > > } > #endif > -#endif > > #ifndef OVERRIDE_comb_filter > void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, > opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, > - const opus_val16 *window, int overlap) > + const opus_val16 *window, int overlap, int arch) > { > int i; > /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */ > @@ -234,7 +238,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, > } > > /* Compute the part with the constant filter. */ > - comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12); > + comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch); > } > #endif /* OVERRIDE_comb_filter */ > > diff --git a/celt/celt.h b/celt/celt.h > index b196751..a423b95 100644 > --- a/celt/celt.h > +++ b/celt/celt.h > @@ -201,7 +201,17 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RES > > void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, > opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, > - const opus_val16 *window, int overlap); > + const opus_val16 *window, int overlap, int arch); > + > +#ifdef NON_STATIC_COMB_FILTER_CONST_C > +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, > + opus_val16 g10, opus_val16 g11, opus_val16 g12); > +#endif > + > +#ifndef OVERRIDE_COMB_FILTER_CONST > +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ > + ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) > +#endif > > void init_caps(const CELTMode *m,int *cap,int LM,int C); > > diff --git a/celt/celt_decoder.c 
b/celt/celt_decoder.c > index 304f334..505a6ef 100644 > --- a/celt/celt_decoder.c > +++ b/celt/celt_decoder.c > @@ -699,7 +699,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) > comb_filter(etmp, buf+DECODE_BUFFER_SIZE, > st->postfilter_period, st->postfilter_period, overlap, > -st->postfilter_gain, -st->postfilter_gain, > - st->postfilter_tapset, st->postfilter_tapset, NULL, 0); > + st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); > > /* Simulate TDAC on the concealed audio so that it blends with the > MDCT of the next frame. */ > @@ -1011,11 +1011,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat > st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD); > comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize, > st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset, > - mode->window, overlap); > + mode->window, overlap, st->arch); > if (LM!=0) > comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize, > st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset, > - mode->window, overlap); > + mode->window, overlap, st->arch); > > } while (++c<CC); > st->postfilter_period_old = st->postfilter_period; > diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c > index 5f48638..1c9dbcb 100644 > --- a/celt/celt_encoder.c > +++ b/celt/celt_encoder.c > @@ -1166,11 +1166,11 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, > if (offset) > comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD, > st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain, > - st->prefilter_tapset, st->prefilter_tapset, NULL, 0); > + st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch); > > 
comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset, > st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1, > - st->prefilter_tapset, prefilter_tapset, mode->window, overlap); > + st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch); > OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap); > > if (N>COMBFILTER_MAXPERIOD) > diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h > index dc8967f..323459e 100644 > --- a/celt/celt_lpc.h > +++ b/celt/celt_lpc.h > @@ -48,7 +48,7 @@ void celt_fir_c( > opus_val16 *mem, > int arch); > > -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#if !defined(OVERRIDE_CELT_FIR) > #define celt_fir(x, num, y, N, ord, mem, arch) \ > (celt_fir_c(x, num, y, N, ord, mem, arch)) > #endif > diff --git a/celt/cpu_support.h b/celt/cpu_support.h > index 1d62e2f..5e99a90 100644 > --- a/celt/cpu_support.h > +++ b/celt/cpu_support.h > @@ -32,7 +32,8 @@ > #include "opus_defines.h" > > #if defined(OPUS_HAVE_RTCD) && \ > - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) > + > #include "arm/armcpu.h" > > /* We currently support 4 ARM variants: > @@ -43,14 +44,16 @@ > */ > #define OPUS_ARCHMASK 3 > > -#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ > + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ > + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) > > #include "x86/x86cpu.h" > -/* We currently support 3 x86 variants: > +/* We currently support 4 x86 variants: > * arch[0] -> non-sse > - * arch[1] -> sse2 > - * arch[2] -> sse4.1 > - * arch[3] -> NULL > + * arch[1] -> sse > + * arch[2] -> sse2 > + * arch[3] -> sse4.1 > */ > #define OPUS_ARCHMASK 3 > int opus_select_arch(void); > diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h > index 
03915d8..7915d59 100644 > --- a/celt/mips/celt_mipsr1.h > +++ b/celt/mips/celt_mipsr1.h > @@ -56,7 +56,7 @@ > #define OVERRIDE_comb_filter > void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, > opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, > - const opus_val16 *window, int overlap) > + const opus_val16 *window, int overlap, int arch) > { > int i; > opus_val32 x0, x1, x2, x3, x4; > diff --git a/celt/pitch.c b/celt/pitch.c > index 4364703..1d89cb0 100644 > --- a/celt/pitch.c > +++ b/celt/pitch.c > @@ -439,7 +439,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, > > T = T0 = *T0_; > ALLOC(yy_lookup, maxperiod+1, opus_val32); > - dual_inner_prod(x, x, x-T0, N, &xx, &xy); > + dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch); > yy_lookup[0] = xx; > yy=xx; > for (i=1;i<=maxperiod;i++) > @@ -483,7 +483,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, > { > T1b = celt_udiv(2*second_check[k]*T0+k, 2*k); > } > - dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2); > + dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch); > xy += xy2; > yy = yy_lookup[T1] + yy_lookup[T1b]; > #ifdef FIXED_POINT > diff --git a/celt/pitch.h b/celt/pitch.h > index 4368cc5..af745eb 100644 > --- a/celt/pitch.h > +++ b/celt/pitch.h > @@ -37,8 +37,8 @@ > #include "modes.h" > #include "cpu_support.h" > > -#if defined(__SSE__) && !defined(FIXED_POINT) \ > - || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) > +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \ > + || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) > #include "x86/pitch_sse.h" > #endif > > @@ -135,8 +135,7 @@ static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * > #endif /* OVERRIDE_XCORR_KERNEL */ > > > -#ifndef OVERRIDE_DUAL_INNER_PROD > -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, 
> +static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, > int N, opus_val32 *xy1, opus_val32 *xy2) > { > int i; > @@ -150,6 +149,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y > *xy1 = xy01; > *xy2 = xy02; > } > + > +#ifndef OVERRIDE_DUAL_INNER_PROD > +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ > + ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2)) > #endif > > /*We make sure a C version is always available for cases where the overhead of > @@ -169,6 +172,12 @@ static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x, > ((void)(arch),celt_inner_prod_c(x, y, N)) > #endif > > +#ifdef NON_STATIC_COMB_FILTER_CONST_C > +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, > + opus_val16 g10, opus_val16 g11, opus_val16 g12); > +#endif > + > + > #ifdef FIXED_POINT > opus_val32 > #else > @@ -180,7 +189,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, > #if !defined(OVERRIDE_PITCH_XCORR) > /*Is run-time CPU detection enabled on this platform?*/ > # if defined(OPUS_HAVE_RTCD) && \ > - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > + (defined(OPUS_ARM_ASM) || (defined(OPUS_ARM_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))) > extern > # if defined(FIXED_POINT) > opus_val32 > diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c > index 84f69bd..57691c6 100644 > --- a/celt/tests/test_unit_dft.c > +++ b/celt/tests/test_unit_dft.c > @@ -50,7 +50,7 @@ > #include "entcode.c" > > #if defined(OPUS_HAVE_RTCD) && \ > - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) > #include "arm/armcpu.c" > #if defined(HAVE_ARM_NE10) > #include "arm/celt_ne10_fft.c" > @@ -60,6 +60,8 @@ > #include "arm/arm_celt_map.c" > #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) > #include "x86/x86cpu.c" > +#include 
"celt/x86/pitch_sse.c" > +#include "x86/x86_celt_map.c" > #endif > > #ifndef M_PI > diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c > index 0f1e4f1..379fbd5 100644 > --- a/celt/tests/test_unit_mathops.c > +++ b/celt/tests/test_unit_mathops.c > @@ -49,12 +49,21 @@ > #include "cwrs.c" > #include "pitch.c" > #include "celt_lpc.c" > +#include "celt.c" > #include "kiss_fft.c" > #include "mdct.c" > > -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) > +#if defined(OPUS_X86_MAY_HAVE_SSE) || \ > + defined(OPUS_X86_MAY_HAVE_SSE2) || \ > + defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#if defined(OPUS_X86_MAY_HAVE_SSE) > #include "x86/pitch_sse.c" > +#endif > +#if defined(OPUS_X86_MAY_HAVE_SSE2) > +#include "x86/pitch_sse2.c" > +#endif > #if defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#include "x86/pitch_sse4_1.c" > #include "x86/celt_lpc_sse.c" > #endif > #include "x86/x86_celt_map.c" > diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c > index c64cac2..d8c4ef0 100644 > --- a/celt/tests/test_unit_mdct.c > +++ b/celt/tests/test_unit_mdct.c > @@ -49,7 +49,7 @@ > #include "entcode.c" > > #if defined(OPUS_HAVE_RTCD) && \ > - (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR)) > + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) > #include "arm/armcpu.c" > #if defined(HAVE_ARM_NE10) > #include "arm/celt_ne10_fft.c" > @@ -60,6 +60,8 @@ > > #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) > #include "x86/x86cpu.c" > +#include "celt/x86/pitch_sse.c" > +#include "x86/x86_celt_map.c" > #endif > > #ifndef M_PI > diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c > index ce14936..3cf54fa 100644 > --- a/celt/tests/test_unit_rotation.c > +++ b/celt/tests/test_unit_rotation.c > @@ -46,13 +46,22 @@ > #include "bands.h" > #include "pitch.c" > #include "celt_lpc.c" > +#include "celt.c" > #include "kiss_fft.c" > #include "mdct.c" > #include <math.h> > > -#if 
defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) > +#if defined(OPUS_X86_MAY_HAVE_SSE) || \ > + defined(OPUS_X86_MAY_HAVE_SSE2) || \ > + defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#if defined(OPUS_X86_MAY_HAVE_SSE) > #include "x86/pitch_sse.c" > +#endif > +#if defined(OPUS_X86_MAY_HAVE_SSE2) > +#include "x86/pitch_sse2.c" > +#endif > #if defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#include "x86/pitch_sse4_1.c" > #include "x86/celt_lpc_sse.c" > #endif > #include "x86/x86_celt_map.c" > diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c > index 9fb9779..67e5592 100644 > --- a/celt/x86/celt_lpc_sse.c > +++ b/celt/x86/celt_lpc_sse.c > @@ -38,6 +38,8 @@ > #include "pitch.h" > #include "x86cpu.h" > > +#if defined(FIXED_POINT) > + > void celt_fir_sse4_1(const opus_val16 *_x, > const opus_val16 *num, > opus_val16 *_y, > @@ -126,3 +128,5 @@ void celt_fir_sse4_1(const opus_val16 *_x, > #endif > RESTORE_STACK; > } > + > +#endif > diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h > index f111420..c5ec796 100644 > --- a/celt/x86/celt_lpc_sse.h > +++ b/celt/x86/celt_lpc_sse.h > @@ -32,7 +32,9 @@ > #include "config.h" > #endif > > -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) > +#define OVERRIDE_CELT_FIR > + > void celt_fir_sse4_1( > const opus_val16 *x, > const opus_val16 *num, > @@ -42,6 +44,12 @@ void celt_fir_sse4_1( > opus_val16 *mem, > int arch); > > +#if defined(OPUS_X86_PRESUME_SSE4_1) > +#define celt_fir(x, num, y, N, ord, mem, arch) \ > + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) > + > +#else > + > extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( > const opus_val16 *x, > const opus_val16 *num, > @@ -56,3 +64,5 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( > > #endif > #endif > + > +#endif > diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c > index e3bc6d7..20e7312 100644 > --- a/celt/x86/pitch_sse.c > +++ b/celt/x86/pitch_sse.c > @@ -29,223 
+29,157 @@ > #include "config.h" > #endif > > -#include <xmmintrin.h> > -#include <emmintrin.h> > - > #include "macros.h" > #include "celt_lpc.h" > #include "stack_alloc.h" > #include "mathops.h" > #include "pitch.h" > > -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) > -#include <smmintrin.h> > -#include "x86cpu.h" > - > -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, > - int N) > -{ > - opus_int i, dataSize16; > - opus_int32 sum; > - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; > - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; > - __m128i inVec1_3210, inVec2_3210; > - > - sum = 0; > - dataSize16 = N & ~15; > - > - acc1 = _mm_setzero_si128(); > - acc2 = _mm_setzero_si128(); > - > - for (i=0;i<dataSize16;i+=16) { > - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > - > - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); > - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); > - > - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > - inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); > - > - acc1 = _mm_add_epi32(acc1, inVec1_76543210); > - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); > - } > +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) > > - acc1 = _mm_add_epi32(acc1, acc2); > - > - if (N - i >= 8) > - { > - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > - > - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > - > - acc1 = _mm_add_epi32(acc1, inVec1_76543210); > - i += 8; > - } > - > - if (N - i >= 4) > - { > - inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); > - inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); > - > - inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); > - > - acc1 = _mm_add_epi32(acc1, inVec1_3210); > - i += 4; > - } > - > - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); 
> - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); > - > - sum += _mm_cvtsi128_si32(acc1); > - > - for (;i<N;i++) > - { > - sum = silk_SMLABB(sum, x[i], y[i]); > - } > +#include <xmmintrin.h> > +#include "arch.h" > > - return sum; > +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) > +{ > + int j; > + __m128 xsum1, xsum2; > + xsum1 = _mm_loadu_ps(sum); > + xsum2 = _mm_setzero_ps(); > + > + for (j = 0; j < len-3; j += 4) > + { > + __m128 x0 = _mm_loadu_ps(x+j); > + __m128 yj = _mm_loadu_ps(y+j); > + __m128 y3 = _mm_loadu_ps(y+j+3); > + > + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); > + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), > + _mm_shuffle_ps(yj,y3,0x49))); > + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), > + _mm_shuffle_ps(yj,y3,0x9e))); > + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); > + } > + if (j < len) > + { > + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > + if (++j < len) > + { > + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > + if (++j < len) > + { > + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > + } > + } > + } > + _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); > } > > -void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) > + > +void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, > + int N, opus_val32 *xy1, opus_val32 *xy2) > { > - int j; > - > - __m128i vecX, vecX0, vecX1, vecX2, vecX3; > - __m128i vecY0, vecY1, vecY2, vecY3; > - __m128i sum0, sum1, sum2, sum3, vecSum; > - __m128i initSum; > - > - celt_assert(len >= 3); > - > - sum0 = _mm_setzero_si128(); > - sum1 = _mm_setzero_si128(); > - sum2 = _mm_setzero_si128(); > - sum3 = _mm_setzero_si128(); > - > - for (j=0;j<(len-7);j+=8) > - { > - vecX = _mm_loadu_si128((__m128i *)(&x[j + 
0])); > - vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); > - vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); > - vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); > - vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); > - > - sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); > - sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); > - sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); > - sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); > - } > - > - sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); > - sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); > - > - sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); > - sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); > - > - sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); > - sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); > - > - sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); > - sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); > - > - vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), > - _mm_unpacklo_epi32(sum2, sum3)); > - > - for (;j<(len-3);j+=4) > - { > - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); > - vecX0 = _mm_shuffle_epi32(vecX, 0x00); > - vecX1 = _mm_shuffle_epi32(vecX, 0x55); > - vecX2 = _mm_shuffle_epi32(vecX, 0xaa); > - vecX3 = _mm_shuffle_epi32(vecX, 0xff); > - > - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); > - vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); > - vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); > - vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); > - > - sum0 = _mm_mullo_epi32(vecX0, vecY0); > - sum1 = _mm_mullo_epi32(vecX1, vecY1); > - sum2 = _mm_mullo_epi32(vecX2, vecY2); > - sum3 = _mm_mullo_epi32(vecX3, vecY3); > - > - sum0 = _mm_add_epi32(sum0, sum1); > - sum2 = _mm_add_epi32(sum2, sum3); > - vecSum = _mm_add_epi32(vecSum, sum0); > - vecSum = _mm_add_epi32(vecSum, sum2); > - } > - > - for (;j<len;j++) > - { > - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); > - vecX0 = 
_mm_shuffle_epi32(vecX, 0x00); > - > - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); > - > - sum0 = _mm_mullo_epi32(vecX0, vecY0); > - vecSum = _mm_add_epi32(vecSum, sum0); > - } > - > - initSum = _mm_loadu_si128((__m128i *)(&sum[0])); > - initSum = _mm_add_epi32(initSum, vecSum); > - _mm_storeu_si128((__m128i *)sum, initSum); > + int i; > + __m128 xsum1, xsum2; > + xsum1 = _mm_setzero_ps(); > + xsum2 = _mm_setzero_ps(); > + for (i=0;i<N-3;i+=4) > + { > + __m128 xi = _mm_loadu_ps(x+i); > + __m128 y1i = _mm_loadu_ps(y01+i); > + __m128 y2i = _mm_loadu_ps(y02+i); > + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); > + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); > + } > + /* Horizontal sum */ > + xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); > + xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); > + _mm_store_ss(xy1, xsum1); > + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); > + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); > + _mm_store_ss(xy2, xsum2); > + for (;i<N;i++) > + { > + *xy1 = MAC16_16(*xy1, x[i], y01[i]); > + *xy2 = MAC16_16(*xy2, x[i], y02[i]); > + } > } > -#endif > > -#if defined(OPUS_X86_MAY_HAVE_SSE2) > -opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, > +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, > int N) > { > - opus_int i, dataSize16; > - opus_int32 sum; > - > - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; > - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; > - > - sum = 0; > - dataSize16 = N & ~15; > - > - acc1 = _mm_setzero_si128(); > - acc2 = _mm_setzero_si128(); > - > - for (i=0;i<dataSize16;i+=16) > - { > - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > - > - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); > - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); > - > - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > - 
inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); > - > - acc1 = _mm_add_epi32(acc1, inVec1_76543210); > - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); > - } > - > - acc1 = _mm_add_epi32( acc1, acc2 ); > - > - if (N - i >= 8) > - { > - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > - > - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > + int i; > + float xy; > + __m128 sum; > + sum = _mm_setzero_ps(); > + /* FIXME: We should probably go 8-way and use 2 sums. */ > + for (i=0;i<N-3;i+=4) > + { > + __m128 xi = _mm_loadu_ps(x+i); > + __m128 yi = _mm_loadu_ps(y+i); > + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); > + } > + /* Horizontal sum */ > + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); > + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); > + _mm_store_ss(&xy, sum); > + for (;i<N;i++) > + { > + xy = MAC16_16(xy, x[i], y[i]); > + } > + return xy; > +} > > - acc1 = _mm_add_epi32(acc1, inVec1_76543210); > - i += 8; > - } > +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, > + opus_val16 g10, opus_val16 g11, opus_val16 g12) > +{ > + int i; > + __m128 x0v; > + __m128 g10v, g11v, g12v; > + g10v = _mm_load1_ps(&g10); > + g11v = _mm_load1_ps(&g11); > + g12v = _mm_load1_ps(&g12); > + x0v = _mm_loadu_ps(&x[-T-2]); > + for (i=0;i<N-3;i+=4) > + { > + __m128 yi, yi2, x1v, x2v, x3v, x4v; > + const opus_val32 *xp = &x[i-T-2]; > + yi = _mm_loadu_ps(x+i); > + x4v = _mm_loadu_ps(xp+4); > +#if 0 > + /* Slower version with all loads */ > + x1v = _mm_loadu_ps(xp+1); > + x2v = _mm_loadu_ps(xp+2); > + x3v = _mm_loadu_ps(xp+3); > +#else > + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); > + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); > + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); > +#endif > > - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); > - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); > - sum += _mm_cvtsi128_si32(acc1); > + yi = 
_mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); > +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ > + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); > + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); > +#else > + /* Use partial sums */ > + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), > + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); > + yi = _mm_add_ps(yi, yi2); > +#endif > + x0v=x4v; > + _mm_storeu_ps(y+i, yi); > + } > +#ifdef CUSTOM_MODES > + for (;i<N;i++) > + { > + y[i] = x[i] > + + MULT16_32_Q15(g10,x[i-T]) > + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) > + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); > + } > +#endif > +} > > - for (;i<N;i++) { > - sum = silk_SMLABB(sum, x[i], y[i]); > - } > > - return sum; > -} > #endif > diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h > index 99d1919..cbe722c 100644 > --- a/celt/x86/pitch_sse.h > +++ b/celt/x86/pitch_sse.h > @@ -37,17 +37,37 @@ > #include "config.h" > #endif > > -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) > -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) > void xcorr_kernel_sse4_1( > const opus_int16 *x, > const opus_int16 *y, > opus_val32 sum[4], > int len); > +#endif > + > +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) > +void xcorr_kernel_sse( > + const opus_val16 *x, > + const opus_val16 *y, > + opus_val32 sum[4], > + int len); > +#endif > + > +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) > +#define OVERRIDE_XCORR_KERNEL > +#define xcorr_kernel(x, y, sum, len, arch) \ > + ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len)) > + > +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) > +#define OVERRIDE_XCORR_KERNEL > +#define xcorr_kernel(x, y, sum, len, arch) \ > + ((void)arch, xcorr_kernel_sse(x, y, sum, len)) > + > +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && 
!defined(FIXED_POINT)) > > extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( > - const opus_int16 *x, > - const opus_int16 *y, > + const opus_val16 *x, > + const opus_val16 *y, > opus_val32 sum[4], > int len); > > @@ -55,181 +75,115 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( > #define xcorr_kernel(x, y, sum, len, arch) \ > ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) > > +#endif > + > +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) > opus_val32 celt_inner_prod_sse4_1( > const opus_int16 *x, > const opus_int16 *y, > int N); > #endif > > -#if defined(OPUS_X86_MAY_HAVE_SSE2) > +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) > opus_val32 celt_inner_prod_sse2( > const opus_int16 *x, > const opus_int16 *y, > int N); > #endif > > +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) > +opus_val32 celt_inner_prod_sse( > + const opus_val16 *x, > + const opus_val16 *y, > + int N); > +#endif > + > + > +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) > +#define OVERRIDE_CELT_INNER_PROD > +#define celt_inner_prod(x, y, N, arch) \ > + ((void)arch, celt_inner_prod_sse4_1(x, y, N)) > + > +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) > +#define OVERRIDE_CELT_INNER_PROD > +#define celt_inner_prod(x, y, N, arch) \ > + ((void)arch, celt_inner_prod_sse2(x, y, N)) > + > +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) > +#define OVERRIDE_CELT_INNER_PROD > +#define celt_inner_prod(x, y, N, arch) \ > + ((void)arch, celt_inner_prod_sse(x, y, N)) > + > + > +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ > + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) > + > extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( > - const opus_int16 *x, > - const opus_int16 *y, > + const opus_val16 *x, > + const opus_val16 *y, > int N); > > #define 
OVERRIDE_CELT_INNER_PROD > #define celt_inner_prod(x, y, N, arch) \ > ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) > -#else > > -#include <xmmintrin.h> > -#include "arch.h" > +#endif > > -#define OVERRIDE_XCORR_KERNEL > -static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) > -{ > - int j; > - __m128 xsum1, xsum2; > - xsum1 = _mm_loadu_ps(sum); > - xsum2 = _mm_setzero_ps(); > - > - for (j = 0; j < len-3; j += 4) > - { > - __m128 x0 = _mm_loadu_ps(x+j); > - __m128 yj = _mm_loadu_ps(y+j); > - __m128 y3 = _mm_loadu_ps(y+j+3); > - > - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); > - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), > - _mm_shuffle_ps(yj,y3,0x49))); > - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), > - _mm_shuffle_ps(yj,y3,0x9e))); > - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); > - } > - if (j < len) > - { > - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > - if (++j < len) > - { > - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > - if (++j < len) > - { > - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); > - } > - } > - } > - _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); > -} > - > -#define xcorr_kernel(_x, _y, _z, len, arch) \ > - ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) > +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) > > #define OVERRIDE_DUAL_INNER_PROD > -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, > - int N, opus_val32 *xy1, opus_val32 *xy2) > -{ > - int i; > - __m128 xsum1, xsum2; > - xsum1 = _mm_setzero_ps(); > - xsum2 = _mm_setzero_ps(); > - for (i=0;i<N-3;i+=4) > - { > - __m128 xi = _mm_loadu_ps(x+i); > - __m128 y1i = _mm_loadu_ps(y01+i); > - __m128 y2i = _mm_loadu_ps(y02+i); > - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); > - 
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); > - } > - /* Horizontal sum */ > - xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); > - xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); > - _mm_store_ss(xy1, xsum1); > - xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); > - xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); > - _mm_store_ss(xy2, xsum2); > - for (;i<N;i++) > - { > - *xy1 = MAC16_16(*xy1, x[i], y01[i]); > - *xy2 = MAC16_16(*xy2, x[i], y02[i]); > - } > -} > +#define OVERRIDE_COMB_FILTER_CONST > > -#define OVERRIDE_CELT_INNER_PROD > -static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, > - int N) > -{ > - int i; > - float xy; > - __m128 sum; > - sum = _mm_setzero_ps(); > - /* FIXME: We should probably go 8-way and use 2 sums. */ > - for (i=0;i<N-3;i+=4) > - { > - __m128 xi = _mm_loadu_ps(x+i); > - __m128 yi = _mm_loadu_ps(y+i); > - sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); > - } > - /* Horizontal sum */ > - sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); > - sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); > - _mm_store_ss(&xy, sum); > - for (;i<N;i++) > - { > - xy = MAC16_16(xy, x[i], y[i]); > - } > - return xy; > -} > - > -# define celt_inner_prod(_x, _y, len, arch) \ > - ((void)(arch),celt_inner_prod_sse(_x, _y, len)) > +void dual_inner_prod_sse(const opus_val16 *x, > + const opus_val16 *y01, > + const opus_val16 *y02, > + int N, > + opus_val32 *xy1, > + opus_val32 *xy2); > + > +void comb_filter_const_sse(opus_val32 *y, > + opus_val32 *x, > + int T, > + int N, > + opus_val16 g10, > + opus_val16 g11, > + opus_val16 g12); > + > + > +#if defined(OPUS_X86_PRESUME_SSE) > +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ > + ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) > > #define OVERRIDE_COMB_FILTER_CONST > -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, > - opus_val16 g10, opus_val16 g11, opus_val16 g12) > -{ 
> - int i; > - __m128 x0v; > - __m128 g10v, g11v, g12v; > - g10v = _mm_load1_ps(&g10); > - g11v = _mm_load1_ps(&g11); > - g12v = _mm_load1_ps(&g12); > - x0v = _mm_loadu_ps(&x[-T-2]); > - for (i=0;i<N-3;i+=4) > - { > - __m128 yi, yi2, x1v, x2v, x3v, x4v; > - const opus_val32 *xp = &x[i-T-2]; > - yi = _mm_loadu_ps(x+i); > - x4v = _mm_loadu_ps(xp+4); > -#if 0 > - /* Slower version with all loads */ > - x1v = _mm_loadu_ps(xp+1); > - x2v = _mm_loadu_ps(xp+2); > - x3v = _mm_loadu_ps(xp+3); > -#else > - x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); > - x1v = _mm_shuffle_ps(x0v, x2v, 0x99); > - x3v = _mm_shuffle_ps(x2v, x4v, 0x99); > -#endif > > - yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); > -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ > - yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); > - yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); > #else > - /* Use partial sums */ > - yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), > - _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); > - yi = _mm_add_ps(yi, yi2); > + > +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( > + const opus_val16 *x, > + const opus_val16 *y01, > + const opus_val16 *y02, > + int N, > + opus_val32 *xy1, > + opus_val32 *xy2); > + > +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ > + ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) > + > +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( > + opus_val32 *y, > + opus_val32 *x, > + int T, > + int N, > + opus_val16 g10, > + opus_val16 g11, > + opus_val16 g12); > + > +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ > + ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12)) > + > +#define NON_STATIC_COMB_FILTER_CONST_C > + > #endif > - x0v=x4v; > - _mm_storeu_ps(y+i, yi); > - } > -#ifdef CUSTOM_MODES > - for (;i<N;i++) > - { > - y[i] = x[i] > - + MULT16_32_Q15(g10,x[i-T]) > - + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) > - + 
MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); > - } > #endif > -} > > #endif > -#endif > diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c > new file mode 100644 > index 0000000..a0e7d1b > --- /dev/null > +++ b/celt/x86/pitch_sse2.c > @@ -0,0 +1,95 @@ > +/* Copyright (c) 2014, Cisco Systems, INC > + Written by XiangMingZhu WeiZhou MinPeng YanWang > + > + Redistribution and use in source and binary forms, with or without > + modification, are permitted provided that the following conditions > + are met: > + > + - Redistributions of source code must retain the above copyright > + notice, this list of conditions and the following disclaimer. > + > + - Redistributions in binary form must reproduce the above copyright > + notice, this list of conditions and the following disclaimer in the > + documentation and/or other materials provided with the distribution. > + > + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER > + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> +*/ > + > +#ifdef HAVE_CONFIG_H > +#include "config.h" > +#endif > + > +#include <xmmintrin.h> > +#include <emmintrin.h> > + > +#include "macros.h" > +#include "celt_lpc.h" > +#include "stack_alloc.h" > +#include "mathops.h" > +#include "pitch.h" > + > +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) > +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, > + int N) > +{ > + opus_int i, dataSize16; > + opus_int32 sum; > + > + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; > + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; > + > + sum = 0; > + dataSize16 = N & ~15; > + > + acc1 = _mm_setzero_si128(); > + acc2 = _mm_setzero_si128(); > + > + for (i=0;i<dataSize16;i+=16) > + { > + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > + > + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); > + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); > + > + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); > + > + acc1 = _mm_add_epi32(acc1, inVec1_76543210); > + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); > + } > + > + acc1 = _mm_add_epi32( acc1, acc2 ); > + > + if (N - i >= 8) > + { > + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > + > + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > + > + acc1 = _mm_add_epi32(acc1, inVec1_76543210); > + i += 8; > + } > + > + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); > + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); > + sum += _mm_cvtsi128_si32(acc1); > + > + for (;i<N;i++) { > + sum = silk_SMLABB(sum, x[i], y[i]); > + } > + > + return sum; > +} > +#endif > diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c > new file mode 100644 > index 0000000..a092c68 > --- /dev/null > +++ 
b/celt/x86/pitch_sse4_1.c > @@ -0,0 +1,195 @@ > +/* Copyright (c) 2014, Cisco Systems, INC > + Written by XiangMingZhu WeiZhou MinPeng YanWang > + > + Redistribution and use in source and binary forms, with or without > + modification, are permitted provided that the following conditions > + are met: > + > + - Redistributions of source code must retain the above copyright > + notice, this list of conditions and the following disclaimer. > + > + - Redistributions in binary form must reproduce the above copyright > + notice, this list of conditions and the following disclaimer in the > + documentation and/or other materials provided with the distribution. > + > + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER > + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> +*/ > + > +#ifdef HAVE_CONFIG_H > +#include "config.h" > +#endif > + > +#include <xmmintrin.h> > +#include <emmintrin.h> > + > +#include "macros.h" > +#include "celt_lpc.h" > +#include "stack_alloc.h" > +#include "mathops.h" > +#include "pitch.h" > + > +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) > +#include <smmintrin.h> > +#include "x86cpu.h" > + > +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, > + int N) > +{ > + opus_int i, dataSize16; > + opus_int32 sum; > + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; > + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; > + __m128i inVec1_3210, inVec2_3210; > + > + sum = 0; > + dataSize16 = N & ~15; > + > + acc1 = _mm_setzero_si128(); > + acc2 = _mm_setzero_si128(); > + > + for (i=0;i<dataSize16;i+=16) { > + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > + > + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); > + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); > + > + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); > + > + acc1 = _mm_add_epi32(acc1, inVec1_76543210); > + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); > + } > + > + acc1 = _mm_add_epi32(acc1, acc2); > + > + if (N - i >= 8) > + { > + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); > + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); > + > + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); > + > + acc1 = _mm_add_epi32(acc1, inVec1_76543210); > + i += 8; > + } > + > + if (N - i >= 4) > + { > + inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); > + inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); > + > + inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); > + > + acc1 = _mm_add_epi32(acc1, inVec1_3210); > + i += 4; > + } > + > + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); > 
+ acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); > + > + sum += _mm_cvtsi128_si32(acc1); > + > + for (;i<N;i++) > + { > + sum = silk_SMLABB(sum, x[i], y[i]); > + } > + > + return sum; > +} > + > +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) > +{ > + int j; > + > + __m128i vecX, vecX0, vecX1, vecX2, vecX3; > + __m128i vecY0, vecY1, vecY2, vecY3; > + __m128i sum0, sum1, sum2, sum3, vecSum; > + __m128i initSum; > + > + celt_assert(len >= 3); > + > + sum0 = _mm_setzero_si128(); > + sum1 = _mm_setzero_si128(); > + sum2 = _mm_setzero_si128(); > + sum3 = _mm_setzero_si128(); > + > + for (j=0;j<(len-7);j+=8) > + { > + vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); > + vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); > + vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); > + vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); > + vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); > + > + sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); > + sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); > + sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); > + sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); > + } > + > + sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); > + sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); > + > + sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); > + sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); > + > + sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); > + sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); > + > + sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); > + sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); > + > + vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), > + _mm_unpacklo_epi32(sum2, sum3)); > + > + for (;j<(len-3);j+=4) > + { > + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); > + vecX0 = _mm_shuffle_epi32(vecX, 0x00); > + vecX1 = 
_mm_shuffle_epi32(vecX, 0x55); > + vecX2 = _mm_shuffle_epi32(vecX, 0xaa); > + vecX3 = _mm_shuffle_epi32(vecX, 0xff); > + > + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); > + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); > + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); > + vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); > + > + sum0 = _mm_mullo_epi32(vecX0, vecY0); > + sum1 = _mm_mullo_epi32(vecX1, vecY1); > + sum2 = _mm_mullo_epi32(vecX2, vecY2); > + sum3 = _mm_mullo_epi32(vecX3, vecY3); > + > + sum0 = _mm_add_epi32(sum0, sum1); > + sum2 = _mm_add_epi32(sum2, sum3); > + vecSum = _mm_add_epi32(vecSum, sum0); > + vecSum = _mm_add_epi32(vecSum, sum2); > + } > + > + for (;j<len;j++) > + { > + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); > + vecX0 = _mm_shuffle_epi32(vecX, 0x00); > + > + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); > + > + sum0 = _mm_mullo_epi32(vecX0, vecY0); > + vecSum = _mm_add_epi32(vecSum, sum0); > + } > + > + initSum = _mm_loadu_si128((__m128i *)(&sum[0])); > + initSum = _mm_add_epi32(initSum, vecSum); > + _mm_storeu_si128((__m128i *)sum, initSum); > +} > +#endif > diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c > index 83410db..1ed2acb 100644 > --- a/celt/x86/x86_celt_map.c > +++ b/celt/x86/x86_celt_map.c > @@ -38,6 +38,8 @@ > > # if defined(FIXED_POINT) > > +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1) > + > void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( > const opus_val16 *x, > const opus_val16 *num, > @@ -49,8 +51,8 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( > ) = { > celt_fir_c, /* non-sse */ > celt_fir_c, > + celt_fir_c, > MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ > - NULL > }; > > void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( > @@ -61,24 +63,86 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( > ) = { > xcorr_kernel_c, /* non-sse */ > xcorr_kernel_c, > + xcorr_kernel_c, > MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ > - NULL > }; > > +#endif > + > +#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && 
!defined(OPUS_X86_PRESUME_SSE4_1)) || \ > + (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) > + > opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( > const opus_val16 *x, > const opus_val16 *y, > int N > ) = { > celt_inner_prod_c, /* non-sse */ > + celt_inner_prod_c, > MAY_HAVE_SSE2(celt_inner_prod), > MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ > - NULL > }; > > +#endif > + > # else > -# error "Floating-point implementation is not supported by x86 RTCD yet." \ > - "Reconfigure with --disable-rtcd or send patches." > -# endif > > +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) > + > +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( > + const opus_val16 *x, > + const opus_val16 *y, > + opus_val32 sum[4], > + int len > +) = { > + xcorr_kernel_c, /* non-sse */ > + MAY_HAVE_SSE(xcorr_kernel), > + MAY_HAVE_SSE(xcorr_kernel), > + MAY_HAVE_SSE(xcorr_kernel), > +}; > + > +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( > + const opus_val16 *x, > + const opus_val16 *y, > + int N > +) = { > + celt_inner_prod_c, /* non-sse */ > + MAY_HAVE_SSE(celt_inner_prod), > + MAY_HAVE_SSE(celt_inner_prod), > + MAY_HAVE_SSE(celt_inner_prod), > +}; > + > +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( > + const opus_val16 *x, > + const opus_val16 *y01, > + const opus_val16 *y02, > + int N, > + opus_val32 *xy1, > + opus_val32 *xy2 > +) = { > + dual_inner_prod_c, /* non-sse */ > + MAY_HAVE_SSE(dual_inner_prod), > + MAY_HAVE_SSE(dual_inner_prod), > + MAY_HAVE_SSE(dual_inner_prod), > +}; > + > +void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( > + opus_val32 *y, > + opus_val32 *x, > + int T, > + int N, > + opus_val16 g10, > + opus_val16 g11, > + opus_val16 g12 > +) = { > + comb_filter_const_c, /* non-sse */ > + MAY_HAVE_SSE(comb_filter_const), > + MAY_HAVE_SSE(comb_filter_const), > + MAY_HAVE_SSE(comb_filter_const), > +}; > + > + > +#endif > + > +#endif > #endif > diff 
--git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c > index c82a4b7..afcdeb6 100644 > --- a/celt/x86/x86cpu.c > +++ b/celt/x86/x86cpu.c > @@ -35,10 +35,19 @@ > #include "pitch.h" > #include "x86cpu.h" > > +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ > + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ > + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) > + > + > #if defined(_MSC_VER) > > #include <intrin.h> > -#define cpuid(info,x) __cpuid(info,x) > +static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) > +{ > + __cpuid((int*)CPUInfo, InfoType); > +} > + > #else > > #if defined(CPU_INFO_BY_C) > @@ -48,14 +57,28 @@ > static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) > { > #if defined(CPU_INFO_BY_ASM) > +#if defined(__i386__) && defined(__PIC__) > +/* %ebx is PIC register in 32-bit, so mustn't clobber it. */ > + __asm__ __volatile__ ( > + "xchg %%ebx, %1\n" > + "cpuid\n" > + "xchg %%ebx, %1\n": > + "=a" (CPUInfo[0]), > + "=r" (CPUInfo[1]), > + "=c" (CPUInfo[2]), > + "=d" (CPUInfo[3]) : > + "0" (InfoType) > + ); > +#else > __asm__ __volatile__ ( > "cpuid": > "=a" (CPUInfo[0]), > "=b" (CPUInfo[1]), > "=c" (CPUInfo[2]), > "=d" (CPUInfo[3]) : > - "a" (InfoType), "c" (0) > + "0" (InfoType) > ); > +#endif > #elif defined(CPU_INFO_BY_C) > __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); > #endif > @@ -63,11 +86,9 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) > > #endif > > -#include "SigProc_FIX.h" > -#include "celt_lpc.h" > - > typedef struct CPU_Feature{ > /* SIMD: 128-bit */ > + int HW_SSE; > int HW_SSE2; > int HW_SSE41; > } CPU_Feature; > @@ -82,19 +103,31 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) > > if (nIds >= 1){ > cpuid(info, 1); > + cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0; > cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; > cpu_feature->HW_SSE41 = (info[2] & (1 << 
19)) != 0; > } > + else { > + cpu_feature->HW_SSE = 0; > + cpu_feature->HW_SSE2 = 0; > + cpu_feature->HW_SSE41 = 0; > + } > } > > int opus_select_arch(void) > { > - CPU_Feature cpu_feature = {0}; > + CPU_Feature cpu_feature; > int arch; > > opus_cpu_feature_check(&cpu_feature); > > arch = 0; > + if (!cpu_feature.HW_SSE) > + { > + return arch; > + } > + arch++; > + > if (!cpu_feature.HW_SSE2) > { > return arch; > @@ -109,3 +142,5 @@ int opus_select_arch(void) > > return arch; > } > + > +#endif > diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h > index ef53f0c..7f4c61d 100644 > --- a/celt/x86/x86cpu.h > +++ b/celt/x86/x86cpu.h > @@ -28,6 +28,12 @@ > #if !defined(X86CPU_H) > # define X86CPU_H > > +# if defined(OPUS_X86_MAY_HAVE_SSE) > +# define MAY_HAVE_SSE(name) name ## _sse > +# else > +# define MAY_HAVE_SSE(name) name ## _c > +# endif > + > # if defined(OPUS_X86_MAY_HAVE_SSE2) > # define MAY_HAVE_SSE2(name) name ## _sse2 > # else > @@ -55,21 +61,25 @@ int opus_select_arch(void); > reference in the PMOVSXWD instruction itself, but gcc is not smart enough to > optimize this out when optimizations ARE enabled. > > - It appears clang requires us to do this always (which is fair, since > - technically the compiler is always allowed to do the dereference before > - invoking the function implementing the intrinsic). I have not investiaged > - whether it is any smarter than gcc when it comes to eliminating the extra > - load instruction.*/ > + Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 > + (which is fair, since technically the compiler is always allowed to do the > + dereference before invoking the function implementing the intrinsic). > + However, it is smart enough to eliminate the extra MOVD instruction. 
> + For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out > + the extra MOVQ if it's specified explicitly */ > + > # if defined(__clang__) || !defined(__OPTIMIZE__) > # define OP_CVTEPI8_EPI32_M32(x) \ > (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) > - > -# define OP_CVTEPI16_EPI32_M64(x) \ > - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) > # else > # define OP_CVTEPI8_EPI32_M32(x) \ > (_mm_cvtepi8_epi32(*(__m128i *)(x))) > +#endif > > +# if !defined(__OPTIMIZE__) > +# define OP_CVTEPI16_EPI32_M64(x) \ > + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) > +# else > # define OP_CVTEPI16_EPI32_M64(x) \ > (_mm_cvtepi16_epi32(*(__m128i *)(x))) > # endif > diff --git a/celt_sources.mk b/celt_sources.mk > index 7121301..2ffe99a 100644 > --- a/celt_sources.mk > +++ b/celt_sources.mk > @@ -21,7 +21,10 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \ > celt/x86/x86_celt_map.c \ > celt/x86/pitch_sse.c > > -CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c > +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c > + > +CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \ > +celt/x86/pitch_sse4_1.c > > CELT_SOURCES_ARM = \ > celt/arm/armcpu.c \ > diff --git a/configure.ac b/configure.ac > index baa3425..9b05fc1 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -348,8 +348,24 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], > AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], > [test x"${asm_optimization%% *}" = x"ARM"]) > > -AM_CONDITIONAL([HAVE_SSE4_1], [false]) > +AM_CONDITIONAL([HAVE_SSE], [false]) > AM_CONDITIONAL([HAVE_SSE2], [false]) > +AM_CONDITIONAL([HAVE_SSE4_1], [false]) > + > +m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse]) > +m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2]) > +m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1]) > +m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon]) > + > +AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@]) > +AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to 
compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@]) > +AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@]) > +AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS[@:>@]) > + > +AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], DEFAULT_X86_SSE_CFLAGS)]) > +AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], DEFAULT_X86_SSE2_CFLAGS)]) > +AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], DEFAULT_X86_SSE4_1_CFLAGS)]) > +AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], DEFAULT_ARM_NEON_INTR_CFLAGS)]) > > AC_DEFUN([OPUS_PATH_NE10], > [ > @@ -426,45 +442,53 @@ AC_DEFUN([OPUS_PATH_NE10], > ) > > AS_IF([test x"$enable_intrinsics" = x"yes"],[ > - case $host_cpu in > - arm*) > + intrinsics_support="" > + AS_CASE([$host_cpu], > + [arm*], > + [ > cpu_arm=yes > - AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics) > - save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS" > - AC_LINK_IFELSE( > - [ > - AC_LANG_PROGRAM( > - [[#include <arm_neon.h> > - ]], > - [[ > - static float32x4_t A[2], SUMM; > - SUMM = vmlaq_f32(SUMM, A[0], A[1]); > - ]] > - ) > - ],[ > - OPUS_ARM_NEON_INTR=1 > - AC_MSG_RESULT([yes]) > - ],[ > - OPUS_ARM_NEON_INTR=0 > - AC_MSG_RESULT([no]) > - ] > + OPUS_CHECK_INTRINSICS( > + [ARM Neon], > + [$ARM_NEON_INTR_CFLAGS], > + [OPUS_ARM_MAY_HAVE_NEON_INTR], > + [OPUS_ARM_PRESUME_NEON_INTR], > + [[#include <arm_neon.h> > + ]], > + [[ > + static float32x4_t A0, A1, SUMM; > + SUMM = vmlaq_f32(SUMM, A0, A1); > + ]] > + ) > + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], > + [ > + OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS" > + AC_SUBST([OPUS_ARM_NEON_INTR_CFLAGS]) > + ] > ) > - CFLAGS="$save_CFLAGS" > - #Now we know if compiler supports ARM neon intrinsics or not > > - 
#Currently we only have intrinsic optimization for floating point > + #Currently we only have intrinsic optimizations for floating point > AS_IF([test x"$enable_float" = x"yes"], > [ > - AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], > + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"], > [ > - AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics]) > - AS_IF([test x"enable_rtcd" != x""], > - [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[]) > - enable_intrinsics="$enable_intrinsics ARMv7_Neon_Intrinsics" > - dnl Don't see why defining these is necessary to check features at runtime > - AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions]) > - AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions]) > - AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions]) > + AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, > + [Compiler supports ARMv7 Neon Intrinsics]) > + intrinsics_support="$intrinsics_support (Neon_Intrinsics)" > + > + AS_IF([test x"enable_rtcd" != x"" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], > + [rtcd_support="$rtcd_support (ARMv7_Neon_Intrinsics)"],[]) > + > + AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"], > + [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1, > + [Define if binary requires NEON intrinsics support])]) > + > + AS_IF([test x"$rtcd_support" = x""], > + [rtcd_support=no]) > + > + AS_IF([test x"$intrinsics_support" = x""], > + [intrinsics_support=no], > + [intrinsics_support="arm$intrinsics_support"]) > + > > OPUS_PATH_NE10() > AS_IF([test x"$NE10_LIBS" != "x"], > @@ -472,18 +496,122 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ > ], > [ > AC_MSG_WARN([Compiler does not support ARM intrinsics]) > - enable_intrinsics=no > + intrinsics_support=no > ]) > ], [ > - AC_MSG_WARN([Currently on have ARM intrinsics for float]) > - enable_intrinsics=no > + AC_MSG_WARN([Currently only have ARM intrinsics for float]) > + 
intrinsics_support=no > ]) > - ;; > - "i386" | "i686" | "x86_64") > - AS_IF([test x"$enable_float" = x"no"],[ > - AS_IF([test x"$enable_rtcd" = x"yes"],[ > + ], > + [i?86|x86_64], > + [ > + OPUS_CHECK_INTRINSICS( > + [SSE], > + [$X86_SSE_CFLAGS], > + [OPUS_X86_MAY_HAVE_SSE], > + [OPUS_X86_PRESUME_SSE], > + [[#include <xmmintrin.h> > + ]], > + [[ > + static __m128 mtest; > + mtest = _mm_setzero_ps(); > + ]] > + ) > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1" && test x"$OPUS_X86_PRESUME_SSE" != x"1"], > + [ > + OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS" > + AC_SUBST([OPUS_X86_SSE_CFLAGS]) > + ] > + ) > + OPUS_CHECK_INTRINSICS( > + [SSE2], > + [$X86_SSE2_CFLAGS], > + [OPUS_X86_MAY_HAVE_SSE2], > + [OPUS_X86_PRESUME_SSE2], > + [[#include <emmintrin.h> > + ]], > + [[ > + static __m128i mtest; > + mtest = _mm_setzero_si128(); > + ]] > + ) > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"], > + [ > + OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS" > + AC_SUBST([OPUS_X86_SSE2_CFLAGS]) > + ] > + ) > + OPUS_CHECK_INTRINSICS( > + [SSE4.1], > + [$X86_SSE4_1_CFLAGS], > + [OPUS_X86_MAY_HAVE_SSE4_1], > + [OPUS_X86_PRESUME_SSE4_1], > + [[#include <smmintrin.h> > + ]], > + [[ > + static __m128i mtest; > + mtest = _mm_setzero_si128(); > + mtest = _mm_cmpeq_epi64(mtest, mtest); > + ]] > + ) > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1" && test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"], > + [ > + OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS" > + AC_SUBST([OPUS_X86_SSE4_1_CFLAGS]) > + ] > + ) > + > + AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""]) > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"], > + [ > + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86 SSE Intrinsics]) > + intrinsics_support="$intrinsics_support SSE" > + > + AS_IF([test x"$OPUS_X86_PRESUME_SSE" = x"1"], > + [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary requires SSE intrinsics support])], > + [rtcd_support="$rtcd_support SSE"]) > + ], > + [ > + 
AC_MSG_WARN([Compiler does not support SSE intrinsics]) > + ]) > + > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"], > + [ > + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86 SSE2 Intrinsics]) > + intrinsics_support="$intrinsics_support SSE2" > + > + AS_IF([test x"$OPUS_X86_PRESUME_SSE2" = x"1"], > + [AC_DEFINE([OPUS_X86_PRESUME_SSE2], 1, [Define if binary requires SSE2 intrinsics support])], > + [rtcd_support="$rtcd_support SSE2"]) > + ], > + [ > + AC_MSG_WARN([Compiler does not support SSE2 intrinsics]) > + ]) > + > + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"], > + [ > + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], 1, [Compiler supports X86 SSE4.1 Intrinsics]) > + intrinsics_support="$intrinsics_support SSE4.1" > + > + AS_IF([test x"$OPUS_X86_PRESUME_SSE4_1" = x"1"], > + [AC_DEFINE([OPUS_X86_PRESUME_SSE4_1], 1, [Define if binary requires SSE4.1 intrinsics support])], > + [rtcd_support="$rtcd_support SSE4.1"]) > + ], > + [ > + AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics]) > + ]) > + AS_IF([test x"$intrinsics_support" = x""], > + [intrinsics_support=no], > + [intrinsics_support="x86$intrinsics_support"] > + ) > + AS_IF([test x"$rtcd_support" = x""], > + [rtcd_support=no], > + [rtcd_support="x86$rtcd_support"], > + ) > + > + AS_IF([test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x""],[ > get_cpuid_by_asm="no" > - AC_MSG_CHECKING([Get CPU Info]) > + AC_MSG_CHECKING([How to get X86 CPU Info]) > AC_LINK_IFELSE([AC_LANG_PROGRAM([[ > #include <stdio.h> > ]],[[ > @@ -493,7 +621,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ > unsigned int CPUInfo3; > unsigned int InfoType; > __asm__ __volatile__ ( > - "cpuid11": > + "cpuid": > "=a" (CPUInfo0), > "=b" (CPUInfo1), > "=c" (CPUInfo2), > @@ -502,7 +630,8 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ > ); > ]])], > [get_cpuid_by_asm="yes" > - AC_MSG_RESULT([Inline Assembly])], > + AC_MSG_RESULT([Inline Assembly]) > + AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], > 
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[ > #include <cpuid.h> > ]],[[ > @@ -513,90 +642,31 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ > unsigned int InfoType; > __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3); > ]])], > - [AC_MSG_RESULT([C method])], > - [AC_MSG_ERROR([not support Get CPU Info, please disable intrinsics ])])]) > - > - AC_MSG_CHECKING([sse4.1]) > - TMP_CFLAGS="$CFLAGS" > - gcc -Q --help=target | grep "\-msse4.1 " > - AS_IF([test x"$?" = x"0"],[ > - CFLAGS="$CFLAGS -msse4.1" > - AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])]) > - AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])]) > - AC_CHECK_HEADER(smmintrin.h, [], [AC_MSG_ERROR([Couldn't find smmintrin.h])],[ > - #ifdef HAVE_XMMINSTRIN_H > - #include <xmmintrin.h> > - #endif > - #ifdef HAVE_EMMINSTRIN_H > - #include <emmintrin.h> > - #endif > - ]) > - > - AC_LINK_IFELSE([AC_LANG_PROGRAM([[ > - #include <xmmintrin.h> > - #include <emmintrin.h> > - #include <smmintrin.h> > - ]],[[ > - __m128i mtest = _mm_setzero_si128(); > - mtest = _mm_cmpeq_epi64(mtest, mtest); > - ]])], > - [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse4.1, please disable intrinsics])]) > - > - CFLAGS="$TMP_CFLAGS" > - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], [1], [For x86 sse4.1 instrinsics optimizations]) > - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimizations]) > - rtcd_support="x86 sse4.1" > - AM_CONDITIONAL([HAVE_SSE4_1], [true]) > - AM_CONDITIONAL([HAVE_SSE2], [true]) > - AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], > - [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])]) > - ],[ ##### Else case for AS_IF([test x"$?" = x"0"]) > - gcc -Q --help=target | grep "\-msse2 " > - AC_MSG_CHECKING([sse2]) > - AS_IF([test x"$?" 
= x"0"],[ > - AC_MSG_RESULT([yes]) > - CFLAGS="$CFLAGS -msse2" > - AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])]) > - AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])]) > - > - AC_LINK_IFELSE([AC_LANG_PROGRAM([[ > - #include <xmmintrin.h> > - #include <emmintrin.h> > - ]],[[ > - __m128i mtest = _mm_setzero_si128(); > - ]])], > - [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse2, please disable intrinsics])]) > - > - CFLAGS="$TMP_CFLAGS" > - AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimize]) > - rtcd_support="x86 sse2" > - AM_CONDITIONAL([HAVE_SSE2], [true]) > - AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])], > - [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])]) > - ],[enable_intrinsics="no"]) #End of AS_IF([test x"$?" = x"0"] > - ]) > - ], [ > - enable_intrinsics="no" > - ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"] > -], > -[ ## Else case for AS_IF([test x"$enable_float" = x"no"] > - AC_MSG_WARN([Disabling intrinsics .. 
x86 intrinsics only avail for fixed point]) > - enable_intrinsics="no" > -]) ## End of AS_IF([test x"$enable_float" = x"no"] > - ;; > - *) > + [AC_MSG_RESULT([C method]) > + AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])], > + [AC_MSG_ERROR([no supported Get CPU Info method, please disable intrinsics])])])]) > + ], > + [ > AC_MSG_WARN([No intrinsics support for your architecture]) > - enable_intrinsics="no" > - ;; > - esac > + intrinsics_support="no" > + ]) > +], > +[ > + intrinsics_support="no" > ]) > > AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) > AM_CONDITIONAL([OPUS_ARM_NEON_INTR], > - [test x"$OPUS_ARM_NEON_INTR" = x"1"]) > + [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"]) > AM_CONDITIONAL([HAVE_ARM_NE10], > [test x"$HAVE_ARM_NE10" = x"1"]) > > +AM_CONDITIONAL([HAVE_SSE], > + [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"]) > +AM_CONDITIONAL([HAVE_SSE2], > + [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"]) > +AM_CONDITIONAL([HAVE_SSE4_1], > + [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"]) > > AS_IF([test x"$enable_rtcd" = x"yes"],[ > AS_IF([test x"$rtcd_support" != x"no"],[ > @@ -704,7 +774,7 @@ AC_MSG_NOTICE([ > Fixed point debugging: ......... ${enable_fixed_point_debug} > Inline Assembly Optimizations: . ${inline_optimization} > External Assembly Optimizations: ${asm_optimization} > - Intrinsics Optimizations.......: ${enable_intrinsics} > + Intrinsics Optimizations.......: ${intrinsics_support} > Run-time CPU detection: ........ ${rtcd_support} > Custom modes: .................. ${enable_custom_modes} > Assertion checking: ............ 
${enable_assertions} > diff --git a/m4/opus-intrinsics.m4 b/m4/opus-intrinsics.m4 > new file mode 100644 > index 0000000..c74aecd > --- /dev/null > +++ b/m4/opus-intrinsics.m4 > @@ -0,0 +1,29 @@ > +dnl opus-intrinsics.m4 > +dnl macro for testing for support for compiler intrinsics, either by default or with a compiler flag > + > +dnl OPUS_CHECK_INTRINSICS(NAME-OF-INTRINSICS, COMPILER-FLAG-FOR-INTRINSICS, VAR-IF-PRESENT, VAR-IF-DEFAULT, TEST-PROGRAM-HEADER, TEST-PROGRAM-BODY) > +AC_DEFUN([OPUS_CHECK_INTRINSICS], > +[ > + AC_MSG_CHECKING([if compiler supports $1 intrinsics]) > + AC_LINK_IFELSE( > + [AC_LANG_PROGRAM($5, $6)], > + [ > + $3=1 > + $4=1 > + AC_MSG_RESULT([yes]) > + ],[ > + $4=0 > + AC_MSG_RESULT([no]) > + AC_MSG_CHECKING([if compiler supports $1 intrinsics with $2]) > + save_CFLAGS="$CFLAGS"; CFLAGS="$2 $CFLAGS" > + AC_LINK_IFELSE([AC_LANG_PROGRAM($5, $6)], > + [ > + AC_MSG_RESULT([yes]) > + $3=1 > + ],[ > + AC_MSG_RESULT([no]) > + $3=0 > + ]) > + CFLAGS="$save_CFLAGS" > + ]) > +]) > diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h > index 9a0e096..61efa8d 100644 > --- a/silk/x86/SigProc_FIX_sse.h > +++ b/silk/x86/SigProc_FIX_sse.h > @@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1( > int arch /* I Run-time architecture */ > ); > > +#if defined(OPUS_X86_PRESUME_SSE4_1) > +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ > + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) > + > +#else > + > extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( > opus_int32 *res_nrg, /* O Residual energy */ > opus_int *res_nrg_Q, /* O Residual energy Q value */ > @@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( > # define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ > ((*SILK_BURG_MODIFIED_IMPL[(arch) & 
OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) > > +#endif > + > opus_int64 silk_inner_prod16_aligned_64_sse4_1( > const opus_int16 *inVec1, > const opus_int16 *inVec2, > const opus_int len > ); > > + > +#if defined(OPUS_X86_PRESUME_SSE4_1) > + > +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ > + ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) > + > +#else > + > extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( > const opus_int16 *inVec1, > const opus_int16 *inVec2, > @@ -75,3 +91,4 @@ extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( > > #endif > #endif > +#endif > diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h > index f970632..afd5ec2 100644 > --- a/silk/x86/main_sse.h > +++ b/silk/x86/main_sse.h > @@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1( > opus_int L /* I number of vectors in codebook */ > ); > > +#if defined OPUS_X86_PRESUME_SSE4_1 > + > +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ > + mu_Q9, max_gain_Q7, L, arch) \ > + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ > + mu_Q9, max_gain_Q7, L)) > + > +#else > + > extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( > opus_int8 *ind, /* O index of best codebook vector */ > opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ > @@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( > ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ > mu_Q9, max_gain_Q7, L)) > > +#endif > + > # define OVERRIDE_silk_NSQ > > void silk_NSQ_sse4_1( > @@ -89,6 +100,15 @@ void silk_NSQ_sse4_1( > const opus_int LTP_scale_Q14 /* I LTP state scaling */ > ); > > +#if defined OPUS_X86_PRESUME_SSE4_1 > + > +#define silk_NSQ(psEncC, NSQ, 
psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ > + ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) > + > +#else > + > extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( > const silk_encoder_state *psEncC, /* I/O Encoder State */ > silk_nsq_state *NSQ, /* I/O NSQ state */ > @@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( > ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) > > +#endif > + > # define OVERRIDE_silk_NSQ_del_dec > > void silk_NSQ_del_dec_sse4_1( > @@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1( > const opus_int LTP_scale_Q14 /* I LTP state scaling */ > ); > > +#if defined OPUS_X86_PRESUME_SSE4_1 > + > +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ > + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) > + > +#else > + > extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( > const silk_encoder_state *psEncC, /* I/O Encoder State */ > silk_nsq_state *NSQ, /* I/O NSQ state */ > @@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( > ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ > HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) > > +#endif > + > void 
silk_noise_shape_quantizer( > silk_nsq_state *NSQ, /* I/O NSQ state */ > opus_int signalType, /* I Signal type */ > @@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( > const opus_int16 pIn[] > ); > > +#if defined(OPUS_X86_PRESUME_SSE4_1) > +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) > + > +#else > + > # define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ > ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) > > @@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( > > # define OVERRIDE_silk_warped_LPC_analysis_filter_FIX > > +#endif > + > void silk_warped_LPC_analysis_filter_FIX_sse4_1( > opus_int32 state[], /* I/O State [order + 1] */ > opus_int32 res_Q2[], /* O Residual signal [length] */ > @@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1( > const opus_int order /* I Filter order (even) */ > ); > > +#if defined(OPUS_X86_PRESUME_SSE4_1) > +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ > + ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) > + > +#else > + > extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])( > opus_int32 state[], /* I/O State [order + 1] */ > opus_int32 res_Q2[], /* O Residual signal [length] */ > @@ -224,5 +270,7 @@ extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1]) > # define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ > ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) > > +#endif > + > # endif > #endif > diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c > index 6747d10..ad9fef2 100644 > --- a/silk/x86/x86_silk_map.c > +++ b/silk/x86/x86_silk_map.c > @@ -35,6 +35,10 @@ > #include "pitch.h" > 
#include "main.h" > > +#if !defined(OPUS_X86_PRESUME_SSE4_1) > + > +#if defined(FIXED_POINT) > + > opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( > const opus_int16 *inVec1, > const opus_int16 *inVec2, > @@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( > ) = { > silk_inner_prod16_aligned_64_c, /* non-sse */ > silk_inner_prod16_aligned_64_c, > + silk_inner_prod16_aligned_64_c, > MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ > - NULL > }; > > +#endif > + > opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( > silk_encoder_state *psEncC, > const opus_int16 pIn[] > ) = { > silk_VAD_GetSA_Q8_c, /* non-sse */ > silk_VAD_GetSA_Q8_c, > + silk_VAD_GetSA_Q8_c, > MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */ > - NULL > }; > > void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( > @@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( > ) = { > silk_NSQ_c, /* non-sse */ > silk_NSQ_c, > + silk_NSQ_c, > MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ > - NULL > }; > > void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( > @@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( > ) = { > silk_VQ_WMat_EC_c, /* non-sse */ > silk_VQ_WMat_EC_c, > + silk_VQ_WMat_EC_c, > MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ > - NULL > }; > > void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( > @@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( > ) = { > silk_NSQ_del_dec_c, /* non-sse */ > silk_NSQ_del_dec_c, > + silk_NSQ_del_dec_c, > MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ > - NULL > }; > > +#if defined(FIXED_POINT) > + > void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( > opus_int32 state[], /* I/O State [order + 1] */ > opus_int32 res_Q2[], /* O Residual signal [length] */ > @@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( > 
) = { > silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */ > silk_warped_LPC_analysis_filter_FIX_c, > + silk_warped_LPC_analysis_filter_FIX_c, > MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */ > - NULL > }; > > void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( > @@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( > ) = { > silk_burg_modified_c, /* non-sse */ > silk_burg_modified_c, > + silk_burg_modified_c, > MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */ > - NULL > }; > + > +#endif > +#endif > diff --git a/win32/VS2010/celt.vcxproj b/win32/VS2010/celt.vcxproj > index f107fec..e068fbe 100644 > --- a/win32/VS2010/celt.vcxproj > +++ b/win32/VS2010/celt.vcxproj > @@ -37,6 +37,12 @@ > <ClCompile Include="..\..\celt\quant_bands.c" /> > <ClCompile Include="..\..\celt\rate.c" /> > <ClCompile Include="..\..\celt\vq.c" /> > + <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c" /> > + <ClCompile Include="..\..\celt\x86\pitch_sse.c" /> > + <ClCompile Include="..\..\celt\x86\pitch_sse2.c" /> > + <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c" /> > + <ClCompile Include="..\..\celt\x86\x86cpu.c" /> > + <ClCompile Include="..\..\celt\x86\x86_celt_map.c" /> > </ItemGroup> > <ItemGroup> > <ClInclude Include="..\..\celt\arch.h" /> > @@ -67,6 +73,9 @@ > <ClInclude Include="..\..\celt\static_modes_fixed.h" /> > <ClInclude Include="..\..\celt\static_modes_float.h" /> > <ClInclude Include="..\..\celt\vq.h" /> > + <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h" /> > + <ClInclude Include="..\..\celt\x86\pitch_sse.h" /> > + <ClInclude Include="..\..\celt\x86\x86cpu.h" /> > <ClInclude Include="..\..\celt\_kiss_fft_guts.h" /> > </ItemGroup> > <PropertyGroup Label="Globals"> > @@ -141,7 +150,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - 
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -168,7 +177,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -196,7 +205,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > </ClCompile> > <Link> > @@ -227,7 +236,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > 
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > </ClCompile> > <Link> > diff --git a/win32/VS2010/celt.vcxproj.filters b/win32/VS2010/celt.vcxproj.filters > index e3a1d97..e9948fa 100644 > --- a/win32/VS2010/celt.vcxproj.filters > +++ b/win32/VS2010/celt.vcxproj.filters > @@ -69,6 +69,24 @@ > <ClCompile Include="..\..\celt\celt.c"> > <Filter>Source Files</Filter> > </ClCompile> > + <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\celt\x86\pitch_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\celt\x86\pitch_sse2.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\celt\x86\x86_celt_map.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\celt\x86\x86cpu.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > </ItemGroup> > <ItemGroup> > <ClInclude Include="..\..\celt\cwrs.h"> > @@ -158,5 +176,14 @@ > <ClInclude Include="..\..\celt\celt_lpc.h"> > <Filter>Header Files</Filter> > </ClInclude> > + <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h"> > + <Filter>Header Files</Filter> > + </ClInclude> > + <ClInclude Include="..\..\celt\x86\pitch_sse.h"> > + <Filter>Header Files</Filter> > + </ClInclude> > + <ClInclude Include="..\..\celt\x86\x86cpu.h"> > + <Filter>Header Files</Filter> > + </ClInclude> 
> </ItemGroup> > </Project> > \ No newline at end of file > diff --git a/win32/VS2010/silk_common.vcxproj b/win32/VS2010/silk_common.vcxproj > index 9cf5f48..d3d077d 100644 > --- a/win32/VS2010/silk_common.vcxproj > +++ b/win32/VS2010/silk_common.vcxproj > @@ -88,7 +88,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -118,7 +118,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -149,7 +149,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> > + 
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > <FloatingPointModel>Fast</FloatingPointModel> > </ClCompile> > @@ -184,7 +184,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > <FloatingPointModel>Fast</FloatingPointModel> > </ClCompile> > @@ -212,6 +212,8 @@ > </ItemDefinitionGroup> > <ItemGroup> > <ClInclude Include="..\..\include\opus_types.h" /> > + <ClInclude Include="..\..\silk\x86\main_sse.h" /> > + <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h" /> > <ClInclude Include="..\..\win32\config.h" /> > <ClInclude Include="..\..\silk\control.h" /> > <ClInclude Include="..\..\silk\debug.h" /> > @@ -311,8 +313,13 @@ > <ClCompile Include="..\..\silk\table_LSF_cos.c" /> > <ClCompile Include="..\..\silk\VAD.c" /> > <ClCompile Include="..\..\silk\VQ_WMat_EC.c" /> > + <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c" /> > + <ClCompile Include="..\..\silk\x86\NSQ_sse.c" /> > + <ClCompile Include="..\..\silk\x86\VAD_sse.c" /> > + <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c" /> > + <ClCompile Include="..\..\silk\x86\x86_silk_map.c" /> > </ItemGroup> > <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> > 
<ImportGroup Label="ExtensionTargets"> > </ImportGroup> > -</Project> > +</Project> > \ No newline at end of file > diff --git a/win32/VS2010/silk_common.vcxproj.filters b/win32/VS2010/silk_common.vcxproj.filters > index 30db48e..341180b 100644 > --- a/win32/VS2010/silk_common.vcxproj.filters > +++ b/win32/VS2010/silk_common.vcxproj.filters > @@ -81,6 +81,12 @@ > <ClInclude Include="..\..\silk\typedef.h"> > <Filter>Header Files</Filter> > </ClInclude> > + <ClInclude Include="..\..\silk\x86\main_sse.h"> > + <Filter>Header Files</Filter> > + </ClInclude> > + <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h"> > + <Filter>Header Files</Filter> > + </ClInclude> > </ItemGroup> > <ItemGroup> > <ClCompile Include="..\..\silk\VQ_WMat_EC.c"> > @@ -311,5 +317,20 @@ > <ClCompile Include="..\..\silk\VAD.c"> > <Filter>Source Files</Filter> > </ClCompile> > + <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\x86\NSQ_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\x86\VAD_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\x86\x86_silk_map.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > </ItemGroup> > -</Project> > +</Project> > \ No newline at end of file > diff --git a/win32/VS2010/silk_fixed.vcxproj b/win32/VS2010/silk_fixed.vcxproj > index 5ea1a91..522101e 100644 > --- a/win32/VS2010/silk_fixed.vcxproj > +++ b/win32/VS2010/silk_fixed.vcxproj > @@ -86,7 +86,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - 
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -104,7 +104,7 @@ > <WarningLevel>Level3</WarningLevel> > <Optimization>Disabled</Optimization> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> > </ClCompile> > <Link> > @@ -123,7 +123,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > </ClCompile> > <Link> > @@ -145,7 +145,7 @@ > <FunctionLevelLinking>true</FunctionLevelLinking> > <IntrinsicFunctions>true</IntrinsicFunctions> > 
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> > - <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories> > + <AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories> > <RuntimeLibrary>MultiThreaded</RuntimeLibrary> > </ClCompile> > <Link> > @@ -191,8 +191,11 @@ > <ClCompile Include="..\..\silk\fixed\solve_LS_FIX.c" /> > <ClCompile Include="..\..\silk\fixed\vector_ops_FIX.c" /> > <ClCompile Include="..\..\silk\fixed\warped_autocorrelation_FIX.c" /> > + <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c" /> > + <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c" /> > + <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c" /> > </ItemGroup> > <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> > <ImportGroup Label="ExtensionTargets"> > </ImportGroup> > -</Project> > +</Project> > \ No newline at end of file > diff --git a/win32/VS2010/silk_fixed.vcxproj.filters b/win32/VS2010/silk_fixed.vcxproj.filters > index 6897930..c2327eb 100644 > --- a/win32/VS2010/silk_fixed.vcxproj.filters > +++ b/win32/VS2010/silk_fixed.vcxproj.filters > @@ -18,16 +18,16 @@ > <ClInclude Include="..\..\win32\config.h"> > <Filter>Header Files</Filter> > </ClInclude> > - <ClInclude Include="main_FIX.h"> > + <ClInclude Include="..\..\include\opus_types.h"> > <Filter>Header Files</Filter> > </ClInclude> > - <ClInclude Include="..\SigProc_FIX.h"> > + <ClInclude Include="..\..\silk\SigProc_FIX.h"> > <Filter>Header Files</Filter> > </ClInclude> > - <ClInclude Include="structs_FIX.h"> > + <ClInclude Include="..\..\silk\fixed\main_FIX.h"> > <Filter>Header Files</Filter> > </ClInclude> > - <ClInclude Include="..\..\include\opus_types.h"> > + 
<ClInclude Include="..\..\silk\fixed\structs_FIX.h"> > <Filter>Header Files</Filter> > </ClInclude> > </ItemGroup> > @@ -107,5 +107,14 @@ > <ClCompile Include="..\..\silk\fixed\LTP_analysis_filter_FIX.c"> > <Filter>Source Files</Filter> > </ClCompile> > + <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > + <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c"> > + <Filter>Source Files</Filter> > + </ClCompile> > </ItemGroup> > </Project> > \ No newline at end of file > diff --git a/win32/config.h b/win32/config.h > index 46ff699..10fbf33 100644 > --- a/win32/config.h > +++ b/win32/config.h > @@ -35,9 +35,28 @@ POSSIBILITY OF SUCH DAMAGE. > > #define OPUS_BUILD 1 > > -/* Enable SSE functions, if compiled with SSE/SSE2 (note that AMD64 implies SSE2) */ > -#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) > -#define __SSE__ 1 > +#if defined(_M_IX86) || defined(_M_X64) > +/* Can always build with SSE intrinsics (no special compiler flags necessary) */ > +#define OPUS_X86_MAY_HAVE_SSE > +#define OPUS_X86_MAY_HAVE_SSE2 > +#define OPUS_X86_MAY_HAVE_SSE4_1 > + > +/* Presume SSE functions, if compiled with SSE/SSE2/AVX (note that AMD64 implies SSE2, and AVX > + implies SSE4.1) */ > +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) || defined(__AVX__) > +#define OPUS_X86_PRESUME_SSE 1 > +#endif > +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__AVX__) > +#define OPUS_X86_PRESUME_SSE2 1 > +#endif > +#if defined(__AVX__) > +#define OPUS_X86_PRESUME_SSE4_1 1 > +#endif > + > +#if !defined(OPUS_X86_PRESUME_SSE4_1) || !defined(OPUS_X86_PRESUME_SSE2) || !defined(OPUS_X86_PRESUME_SSE) > +#define OPUS_HAVE_RTCD 1 > +#endif > + > #endif > > #include "version.h" > -- > 1.9.1 >
Possibly Parallel Threads
- [RFC PATCH v3] Intrinsics/RTCD related fixes. Mostly x86.
- [RFC PATCH v1 0/4] Enable aarch64 intrinsics/Ne10
- [RFC PATCH v1 0/5] aarch64: celt_pitch_xcorr: Fixed point series
- [RFC PATCH v2 0/8]: Ne10 fft fixed and previous
- [RFC V3 0/8] Ne10 fft fixed and previous