Yes, Thank you. I'll follow up with the AVX code and tests for pitch code. Radu -----Original Message----- From: opus-bounces at xiph.org [mailto:opus-bounces at xiph.org] On Behalf Of Timothy B. Terriberry Sent: Thursday, November 5, 2015 10:31 AM To: opus at xiph.org Subject: Re: [opus] AVX Optimizations Velea, Radu wrote:> I've created a pull request[1] to enable configuration to search for AVX support and set OPUS_X86_MAY_HAVE_AVX accordingly. > Please review.These commits both look fine. Would you like me to land them? _______________________________________________ opus mailing list opus at xiph.org http://lists.xiph.org/mailman/listinfo/opus
Velea, Radu wrote:> Yes, > > Thank you. I'll follow up with the AVX code and tests for pitch code.Actually, I lied. Because you update opus_select_arch(), you can now return a value for arch (4) that is larger than the maximum we currently support (3). This doesn't actually cause failures, because we mask with OPUS_ARCHMASK, but it does mean that a CPU with AVX will invoke the C versions of every function instead of any potential SSE, SSE2, or SSE4.1 versions. It only works this well because the current maximum arch index happens to be one less than a power of two. Here's a patch that fixes the problem. If this looks okay to you, I'll amend your second commit to include these changes. diff --git a/celt/cpu_support.h b/celt/cpu_support.h index db1cb58..133abbf 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -43,23 +43,24 @@ */ #define OPUS_ARCHMASK 3 #elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) #include "x86/x86cpu.h" -/* We currently support 4 x86 variants: +/* We currently support 5 x86 variants: * arch[0] -> non-sse * arch[1] -> sse * arch[2] -> sse2 * arch[3] -> sse4.1 + * arch[4] -> avx */ -#define OPUS_ARCHMASK 3 +#define OPUS_ARCHMASK 7 int opus_select_arch(void); #else #define OPUS_ARCHMASK 0 static OPUS_INLINE int opus_select_arch(void) { return 0; diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 1ed2acb..8e5e449 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -48,44 +48,47 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, opus_val16 *mem, int arch ) = { celt_fir_c, /* non-sse */ celt_fir_c, celt_fir_c, MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_fir) /* avx */ }; void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len ) = { xcorr_kernel_c, /* non-sse */ xcorr_kernel_c, xcorr_kernel_c, MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ + MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */ }; #endif #if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ celt_inner_prod_c, MAY_HAVE_SSE2(celt_inner_prod), MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */ }; #endif # else #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) @@ -94,55 +97,59 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *y, opus_val32 sum[4], int len ) = { xcorr_kernel_c, /* non-sse */ MAY_HAVE_SSE(xcorr_kernel), MAY_HAVE_SSE(xcorr_kernel), MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel) }; opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ MAY_HAVE_SSE(celt_inner_prod), MAY_HAVE_SSE(celt_inner_prod), MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod) }; void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2 ) = { dual_inner_prod_c, /* non-sse */ MAY_HAVE_SSE(dual_inner_prod), MAY_HAVE_SSE(dual_inner_prod), MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod) }; void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12 ) = { comb_filter_const_c, /* non-sse */ MAY_HAVE_SSE(comb_filter_const), MAY_HAVE_SSE(comb_filter_const), MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const) }; #endif #endif #endif
Sorry. I missed that. Good observation. Please go ahead and correct the patch. Thanks, Radu -----Original Message----- From: opus-bounces at xiph.org [mailto:opus-bounces at xiph.org] On Behalf Of Timothy B. Terriberry Sent: Thursday, November 5, 2015 11:08 AM To: opus at xiph.org Subject: Re: [opus] AVX Optimizations Velea, Radu wrote:> Yes, > > Thank you. I'll follow up with the AVX code and tests for pitch code.Actually, I lied. Because you update opus_select_arch(), you can now return a value for arch (4) that is larger than the maximum we currently support (3). This doesn't actually cause failures, because we mask with OPUS_ARCHMASK, but it does mean that a CPU with AVX will invoke the C versions of every function instead of any potential SSE, SSE2, or SSE4.1 versions. It only works this well because the current maximum arch index happens to be one less than a power of two. Here's a patch that fixes the problem. If this looks okay to you, I'll amend your second commit to include these changes. diff --git a/celt/cpu_support.h b/celt/cpu_support.h index db1cb58..133abbf 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -43,23 +43,24 @@ */ #define OPUS_ARCHMASK 3 #elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) #include "x86/x86cpu.h" -/* We currently support 4 x86 variants: +/* We currently support 5 x86 variants: * arch[0] -> non-sse * arch[1] -> sse * arch[2] -> sse2 * arch[3] -> sse4.1 + * arch[4] -> avx */ -#define OPUS_ARCHMASK 3 +#define OPUS_ARCHMASK 7 int opus_select_arch(void); #else #define OPUS_ARCHMASK 0 static OPUS_INLINE int opus_select_arch(void) { return 0; diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 1ed2acb..8e5e449 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -48,44 +48,47 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, opus_val16 *mem, int arch ) = { celt_fir_c, /* non-sse */ celt_fir_c, celt_fir_c, MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_fir) /* avx */ }; void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len ) = { xcorr_kernel_c, /* non-sse */ xcorr_kernel_c, xcorr_kernel_c, MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ + MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */ }; #endif #if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ celt_inner_prod_c, MAY_HAVE_SSE2(celt_inner_prod), MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */ }; #endif # else #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) @@ -94,55 +97,59 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *y, opus_val32 sum[4], int len ) = { xcorr_kernel_c, /* non-sse */ MAY_HAVE_SSE(xcorr_kernel), MAY_HAVE_SSE(xcorr_kernel), MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel) }; opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ MAY_HAVE_SSE(celt_inner_prod), MAY_HAVE_SSE(celt_inner_prod), MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod) }; void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2 ) = { dual_inner_prod_c, /* non-sse */ MAY_HAVE_SSE(dual_inner_prod), MAY_HAVE_SSE(dual_inner_prod), MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod) }; void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12 ) = { comb_filter_const_c, /* non-sse */ MAY_HAVE_SSE(comb_filter_const), MAY_HAVE_SSE(comb_filter_const), MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const) }; #endif #endif #endif _______________________________________________ opus mailing list opus at xiph.org http://lists.xiph.org/mailman/listinfo/opus