I wrote a proof of concept regarding runtime detection of CPU capabilities and selection of the optimized function. I followed the design which had been discussed on IRC. Also, I noticed a small drawback: we must propagate the arch index through functions which don't have the codec state as an argument. However, if it looks good, I will continue to implement it. Best regards, -- Aurélien Zanelli Parrot SA 174, quai de Jemmapes 75010 Paris France -------------- next part -------------- diff --git a/Makefile.am b/Makefile.am index f04e3bc..06d4283 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,7 +5,7 @@ lib_LTLIBRARIES = libopus.la DIST_SUBDIRS = doc -INCLUDES = -I$(top_srcdir)/include -I$(top_srcdir)/celt -I$(top_srcdir)/silk -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed +INCLUDES = -I$(top_srcdir)/include -I$(top_srcdir)/celt -I$(top_srcdir)/silk -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed -I$(top_srcdir)/src include celt_sources.mk include silk_sources.mk diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index d5d2c57..673ab4b 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -33,6 +33,7 @@ #define CELT_DECODER_C +#include "cpu_support.h" #include "os_support.h" #include "mdct.h" #include <math.h> @@ -69,6 +70,7 @@ struct OpusCustomDecoder { int downsample; int start, end; int signalling; + int arch; /* Everything beyond this point gets cleared on a reset */ #define DECODER_RESET_START rng @@ -159,6 +161,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMod st->signalling = 1; st->loss_count = 0; + st->arch = opus_select_arch(); opus_custom_decoder_ctl(st, OPUS_RESET_STATE); @@ -430,7 +433,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R { VARDECL( opus_val16, lp_pitch_buf ); ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 ); - pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C); + pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C, st->arch); 
pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf, DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX, PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index); @@ -496,7 +499,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT); } /* Compute the excitation for exc_length samples before the loss. */ - celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, + celt_fir[st->arch&OPUS_ARCHMASK](exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem); } diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 26e6ebb..08fddd0 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -33,6 +33,7 @@ #define CELT_ENCODER_C +#include "cpu_support.h" #include "os_support.h" #include "mdct.h" #include <math.h> @@ -75,6 +76,7 @@ struct OpusCustomEncoder { int lsb_depth; int variable_duration; int lfe; + int arch; /* Everything beyond this point gets cleared on a reset */ #define ENCODER_RESET_START rng @@ -196,6 +198,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod st->force_intra = 0; st->complexity = 5; st->lsb_depth=24; + st->arch = opus_select_arch(); opus_custom_encoder_ctl(st, OPUS_RESET_STATE); @@ -1014,7 +1017,7 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, VARDECL(opus_val16, pitch_buf); ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16); - pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC); + pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch); /* Don't search for the fir last 1.5 octave of the range because there's too many false-positives due to short-term correlation */ pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N, diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index c75c25b..1eac65b 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -32,9 +32,21 @@ #include "celt_lpc.h" #include "stack_alloc.h" 
#include "mathops.h" +#include "cpu_support.h" -#ifdef ARM_HAVE_NEON +#ifdef ARM_ASM #include "celt_lpc_neon.h" +void (* const celt_fir[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, + opus_val16 *, int, int, opus_val16 *) = { + celt_fir_c, //C + celt_fir_c, //ARMV4 + celt_fir_c, //ARMv5E + celt_fir_c, //ARMv6 + celt_fir_neon //NEON +}; +#else +void (* const celt_fir[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, + opus_val16 *, int, int, opus_val16 *) = {celt_fir_c}; #endif void _celt_lpc( @@ -91,8 +103,7 @@ int p #endif } -#ifndef OVERRIDE_CELT_FIR -void celt_fir(const opus_val16 *x, +void celt_fir_c(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, int N, @@ -116,7 +127,6 @@ void celt_fir(const opus_val16 *x, y[i] = ROUND16(sum, SIG_SHIFT); } } -#endif void celt_iir(const opus_val32 *x, const opus_val16 *den, @@ -142,7 +152,6 @@ void celt_iir(const opus_val32 *x, } } -#ifndef OVERRIDE_CELT_AUTOCORR void _celt_autocorr( const opus_val16 *x, /* in: [0...n-1] samples x */ opus_val32 *ac, /* out: [0...lag-1] ac values */ @@ -198,4 +207,3 @@ void _celt_autocorr( RESTORE_STACK; } -#endif diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index 2baa77e..dcd9666 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -29,12 +29,16 @@ #define PLC_H #include "arch.h" +#include "cpu_support.h" #define LPC_ORDER 24 +void (* const celt_fir[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val16 *, int, int, opus_val16 *); + void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); -void celt_fir(const opus_val16 *x, +void celt_fir_c(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, int N, diff --git a/celt/celt_lpc_neon.h b/celt/celt_lpc_neon.h index e9f76c6..029ae7b 100644 --- a/celt/celt_lpc_neon.h +++ b/celt/celt_lpc_neon.h @@ -28,9 +28,9 @@ #define CELT_LPC_NEON_H #ifdef FIXED_POINT +#include "stack_alloc.h" +#include "mathops.h" -#ifndef DISABLE_CELT_FIR_NEON -#define OVERRIDE_CELT_FIR /* Optimized FIR filter for 
order 1 and 4 which are used by opus encoder * FIR calls in pitch.c are hard-coded with 1 and 4 order values * @@ -240,7 +240,7 @@ static void celt_fir4(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, ); } -void celt_fir(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, +void celt_fir_neon(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, int N, int ord, opus_val16 *mem) { int i,j; @@ -269,12 +269,9 @@ void celt_fir(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, break; } } -#endif /* CELT_FIR_NEON */ -#ifndef DISABLE_CELT_AUTOCORR_NEON -#define OVERRIDE_CELT_AUTOCORR -void _celt_autocorr( +void _celt_autocorr_neon( const opus_val16 *x, /* in: [0...n-1] samples x */ opus_val32 *ac, /* out: [0...lag-1] ac values */ const opus_val16 *window, @@ -478,7 +475,6 @@ void _celt_autocorr( RESTORE_STACK; } -#endif /* CELT_AUTOCORR_NEON */ #endif /* FIXED_POINT */ diff --git a/celt/pitch.c b/celt/pitch.c index 800a52a..6850dee 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -109,7 +109,7 @@ void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len, } void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C) + int len, int C, const int arch) { int i; opus_val32 ac[5]; @@ -167,11 +167,11 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp); lpc[i] = MULT16_16_Q15(lpc[i], tmp); } - celt_fir(x_lp, lpc, x_lp, len>>1, 4, mem); + celt_fir[arch&OPUS_ARCHMASK](x_lp, lpc, x_lp, len>>1, 4, mem); mem[0]=0; lpc[0]=QCONST16(.8f,12); - celt_fir(x_lp, lpc, x_lp, len>>1, 1, mem); + celt_fir[arch&OPUS_ARCHMASK](x_lp, lpc, x_lp, len>>1, 1, mem); } diff --git a/celt/pitch.h b/celt/pitch.h index 2757071..ad23aa9 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -37,7 +37,7 @@ #include "modes.h" void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C); + int len, int C, const int arch); void 
pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, int len, int max_pitch, int *pitch); diff --git a/configure.ac b/configure.ac index ee6df9a..6b9612f 100644 --- a/configure.ac +++ b/configure.ac @@ -167,6 +167,7 @@ if test "x${ac_enable_asm}" = xyes ; then AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"], [asm_optimization="disabled"]) if test "x${asm_optimization}" = "xARM" ; then + AC_DEFINE([ARM_ASM], 1, [Use generic ARM asm optimizations]) AC_DEFINE([ARMv4_ASM], 1, [Use generic ARMv4 asm optimizations]) AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0]) if test "x${ARMv5E_ASM}" = "x1" ; then diff --git a/opus_headers.mk b/opus_headers.mk index 43a978c..2c7c077 100644 --- a/opus_headers.mk +++ b/opus_headers.mk @@ -2,6 +2,7 @@ OPUS_HEAD = \ include/opus.h \ include/opus_multistream.h \ src/opus_private.h \ +src/cpu_support.h \ src/analysis.h \ src/mlp.h \ src/tansig_table.h diff --git a/opus_sources.mk b/opus_sources.mk index e4eeb91..1e9791b 100644 --- a/opus_sources.mk +++ b/opus_sources.mk @@ -4,7 +4,8 @@ src/opus_encoder.c \ src/opus_multistream.c \ src/opus_multistream_encoder.c \ src/opus_multistream_decoder.c \ -src/repacketizer.c +src/repacketizer.c \ +src/armcpu.c OPUS_SOURCES_FLOAT = \ src/analysis.c \ diff --git a/src/armcpu.c b/src/armcpu.c new file mode 100644 index 0000000..10a2905 --- /dev/null +++ b/src/armcpu.c @@ -0,0 +1,160 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from libtheora modified to suit to Opus */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "armcpu.h" + +#if !defined(ARM_ASM) || \ + !defined(ARMv5E_ASM) && !defined(ARMv6_ASM) && \ + !defined(ARM_HAVE_NEON) +opus_uint32 opus_cpu_capa(void) +{ + return 0; +} +#elif defined(_MSC_VER) +/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +# define WIN32_LEAN_AND_MEAN +# define WIN32_EXTRA_LEAN +# include <windows.h> + +opus_uint32 opus_cpu_capa(void){ + opus_uint32 flags; + flags=0; + /*MSVC has no inline __asm support for ARM, but it does let you __emit + * instructions via their assembled hex code. 
+ * All of these instructions should be essentially nops.*/ +# if defined(ARMv5E_ASM) + __try{ + /*PLD [r13]*/ + __emit(0xF5DDF000); + flags|=OPUS_CPU_ARM_EDSP; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(ARMv6E_ASM) + __try{ + /*SHADD8 r3,r3,r3*/ + __emit(0xE6333F93); + flags|=OPUS_CPU_ARM_MEDIA; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(ARM_HAVE_NEON) + __try{ + /*VORR q0,q0,q0*/ + __emit(0xF2200150); + flags|=OPUS_CPU_ARM_NEON; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# endif +# endif +# endif + return flags; +} + +#elif defined(__linux__) +/* Linux based */ +opus_uint32 opus_cpu_capa(void) +{ + opus_uint32 flags = 0; + FILE *cpuinfo; + + /* Reading /proc/self/auxv would be easier, but that doesn't work reliably on + * Android */ + cpuinfo = fopen("/proc/cpuinfo", "r"); + + if(cpuinfo != NULL) + { + /* 512 should be enough for anybody (it's even enough for all the flags that + * x86 has accumulated... so far). 
*/ + char buf[512]; + + while(fgets(buf, 512, cpuinfo) != NULL) + { + /* Search for edsp and neon flag */ + if(memcmp(buf, "Features", 8) == 0) + { + char *p; + p = strstr(buf, " edsp"); + if(p != NULL && (p[5] == ' ' || p[5] == '\n')) + flags |= OPUS_CPU_ARM_EDSP; + + p = strstr(buf, " neon"); + if(p != NULL && (p[5] == ' ' || p[5] == '\n')) + flags |= OPUS_CPU_ARM_NEON; + } + + /* Search for media capabilities (>= ARMv6) */ + if(memcmp(buf, "CPU architecture:", 17) == 0) + { + int version; + version = atoi(buf+17); + + if(version == 4) + flags |= OPUS_CPU_ARM_V4; + + if(version >= 6) + flags |= OPUS_CPU_ARM_MEDIA; + } + } + + fclose(cpuinfo); + } + return flags; +} +#else +/* The feature registers which can tell us what the processor supports are + * accessible in priveleged modes only, so we can't have a general user-space + * detection method like on x86.*/ +# error "Configured to use ARM asm but no CPU detection method available for " \ + "your platform. Reconfigure with --disable-asm (or send patches)." +#endif + +int opus_select_arch(void) +{ + opus_uint32 flags = opus_cpu_capa(); + + if(flags & OPUS_CPU_ARM_NEON) + return 4; + else if(flags & OPUS_CPU_ARM_MEDIA) + return 3; + else if(flags & OPUS_CPU_ARM_EDSP) + return 2; + else if(flags & OPUS_CPU_ARM_V4) + return 1; + else + return 0; +} diff --git a/src/armcpu.h b/src/armcpu.h new file mode 100644 index 0000000..358df84 --- /dev/null +++ b/src/armcpu.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from libtheora modified to suit to Opus */ + +#ifndef ARMCPU_H +#define ARMCPU_H + +#include "opus_types.h" +#include "os_support.h" + +#define OPUS_CPU_ARM_V4 (1) +#define OPUS_CPU_ARM_EDSP (1<<1) +#define OPUS_CPU_ARM_MEDIA (1<<2) +#define OPUS_CPU_ARM_NEON (1<<3) + +opus_uint32 opus_cpu_capa(void); +int opus_select_arch(void); + +#endif diff --git a/src/cpu_support.h b/src/cpu_support.h new file mode 100644 index 0000000..bc3b4a3 --- /dev/null +++ b/src/cpu_support.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CPU_SUPPORT_H +#define CPU_SUPPORT_H + +#ifdef ARM_ASM + +#include "armcpu.h" + +/* We currently support C code and 4 ARM variants: + * arch[0] -> C + * arch[1] -> ARMv4 + * arch[2] -> ARMv5E + * arch[3] -> ARMv6 + * arch[4] -> NEON + */ +#define OPUS_ARCHMASK 4 + +#else +#define OPUS_ARCHMASK 0 +static inline opus_uint32 opus_cpu_capa(void) +{ + return 0; +} + +static inline int opus_select_arch(void) +{ + return 0; +} +#endif + +#endif diff --git a/src/opus_decoder.c b/src/opus_decoder.c index f0b2b6f..6bc7091 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -46,6 +46,7 @@ #include "structs.h" #include "define.h" #include "mathops.h" +#include "cpu_support.h" struct OpusDecoder { int celt_dec_offset; @@ -70,6 +71,7 @@ struct OpusDecoder { #endif opus_uint32 rangeFinal; + int arch; }; #ifdef FIXED_POINT @@ -119,6 +121,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels) st->Fs = Fs; st->DecControl.API_sampleRate = st->Fs; 
st->DecControl.nChannelsAPI = st->channels; + st->arch = opus_select_arch(); /* Reset decoder */ ret = silk_InitDecoder( silk_dec ); diff --git a/src/opus_encoder.c b/src/opus_encoder.c index b6424d6..305fad9 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -40,6 +40,7 @@ #include "arch.h" #include "opus_private.h" #include "os_support.h" +#include "cpu_support.h" #include "analysis.h" #include "mathops.h" #include "tuning_parameters.h" @@ -103,6 +104,7 @@ struct OpusEncoder { int analysis_offset; #endif opus_uint32 rangeFinal; + int arch; }; /* Transition tables for the voice and music. First column is the @@ -184,6 +186,8 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->Fs = Fs; + st->arch = opus_select_arch(); + ret = silk_InitEncoder( silk_enc, &st->silk_mode ); if(ret)return OPUS_INTERNAL_ERROR;
Timothy B. Terriberry
2013-May-23 18:34 UTC
[opus] ASM runtime detection and optimizations
Aurélien Zanelli wrote: > I wrote a proof of concept regarding the cpu capabilities runtime > detection and choice of optimized function. I follow design which had > been discussed on IRC. This is a good start. Review comments inline marked with [TBT]. > Also, i notice a little drawback: we must propagate the arch index > through functions which don't have codec state as argument. Yeah, that's pretty much unavoidable. I don't think the overhead will be high, though. - celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, + celt_fir[st->arch&OPUS_ARCHMASK](exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem); [TBT] I think this should be hidden in a macro, e.g., in celt_lpc.h: /*Do we have run-time CPU detection?*/ #if OPUS_HAVE_RTCD extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *,opus_val16 *,int, int, opus_val16 *); # define celt_fir(x, num, y, N, ord, mem, arch) \ ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, mem)) #else # define celt_fir(x, num, y, N, ord, mem, arch) \ (celt_fir_c(x, num, y, N, ord, mem)) #endif Then, we can call celt_fir() like a normal function with an additional arch parameter. That lets us hide the NEON-specific stuff in celt_lpc_neon.h, and override the define to hard-code use of the NEON version if we know we're compiling for a specific CPU, etc. --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -32,9 +32,21 @@ #include "celt_lpc.h" #include "stack_alloc.h" #include "mathops.h" +#include "cpu_support.h" -#ifdef ARM_HAVE_NEON +#ifdef ARM_ASM #include "celt_lpc_neon.h" [TBT] I think this include should happen in celt_lpc.h. [TBT] I think the definition of this array should be in, e.g., a celt_lpc_arm.c under celt/arm. 
+void (* const celt_fir[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, + opus_val16 *, int, int, opus_val16 *) = { + celt_fir_c, //C [TBT] We're assuming at least ARMv4 if we have any ARM asm enabled at all, so you don't need a C version. That gets us down to 4 variants on ARM, so there's no wasted array entries. + celt_fir_c, //ARMV4 + celt_fir_c, //ARMv5E + celt_fir_c, //ARMv6 + celt_fir_neon //NEON +}; +#else +void (* const celt_fir[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, + opus_val16 *, int, int, opus_val16 *) = {celt_fir_c}; [TBT] If we don't have ARM asm, we shouldn't be using RTCD, so (using the #ifdefs I proposed above), we wouldn't need this. #endif void _celt_lpc( @@ -91,8 +103,7 @@ int p #endif } -#ifndef OVERRIDE_CELT_FIR -void celt_fir(const opus_val16 *x, +void celt_fir_c(const opus_val16 *x, const opus_val16 *num, opus_val16 *y, int N, @@ -116,7 +127,6 @@ void celt_fir(const opus_val16 *x, y[i] = ROUND16(sum, SIG_SHIFT); } } -#endif [TBT] BTW, I think it would be better to rebase these patches instead of undoing a bunch of things you did in patch 02. [TBT] celt_lpc_neon.* should go under celt/arm now. diff --git a/celt/celt_lpc_neon.h b/celt/celt_lpc_neon.h index e9f76c6..029ae7b 100644 --- a/celt/celt_lpc_neon.h +++ b/celt/celt_lpc_neon.h [TBT] See above. 
- celt_fir(x_lp, lpc, x_lp, len>>1, 4, mem); + celt_fir[arch&OPUS_ARCHMASK](x_lp, lpc, x_lp, len>>1, 4, mem); mem[0]=0; lpc[0]=QCONST16(.8f,12); - celt_fir(x_lp, lpc, x_lp, len>>1, 1, mem); + celt_fir[arch&OPUS_ARCHMASK](x_lp, lpc, x_lp, len>>1, 1, mem); } diff --git a/configure.ac b/configure.ac index ee6df9a..6b9612f 100644 --- a/configure.ac +++ b/configure.ac @@ -167,6 +167,7 @@ if test "x${ac_enable_asm}" = xyes ; then AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"], [asm_optimization="disabled"]) if test "x${asm_optimization}" = "xARM" ; then + AC_DEFINE([ARM_ASM], 1, [Use generic ARM asm optimizations]) [TBT] I don't think we should distinguish between "generic ARM" and ARMv4, because we don't have any plans to support earlier ARM versions. If you want to rename ARMv4_ASM to ARM_ASM, that's okay, but my opinion was that calling it ARMv4_ASM and not having an ARM_ASM made that assumption unambiguous. Earlier devices (if any) can just disable ARM asm entirely. AC_DEFINE([ARMv4_ASM], 1, [Use generic ARMv4 asm optimizations]) AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0]) if test "x${ARMv5E_ASM}" = "x1" ; then diff --git a/src/armcpu.c b/src/armcpu.c new file mode 100644 index 0000000..10a2905 --- /dev/null +++ b/src/armcpu.c @@ -0,0 +1,160 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from libtheora modified to suit to Opus */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "armcpu.h" + +#if !defined(ARM_ASM) || \ + !defined(ARMv5E_ASM) && !defined(ARMv6_ASM) && \ + !defined(ARM_HAVE_NEON) [TBT] Please don't abbreviate "capabilities". +opus_uint32 opus_cpu_capa(void) +{ + return 0; +} +#elif defined(_MSC_VER) +/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +# define WIN32_LEAN_AND_MEAN +# define WIN32_EXTRA_LEAN +# include <windows.h> + +opus_uint32 opus_cpu_capa(void){ + opus_uint32 flags; + flags=0; + /*MSVC has no inline __asm support for ARM, but it does let you __emit + * instructions via their assembled hex code. 
+ * All of these instructions should be essentially nops.*/ +# if defined(ARMv5E_ASM) + __try{ + /*PLD [r13]*/ + __emit(0xF5DDF000); + flags|=OPUS_CPU_ARM_EDSP; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(ARMv6E_ASM) + __try{ + /*SHADD8 r3,r3,r3*/ + __emit(0xE6333F93); + flags|=OPUS_CPU_ARM_MEDIA; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(ARM_HAVE_NEON) + __try{ + /*VORR q0,q0,q0*/ + __emit(0xF2200150); + flags|=OPUS_CPU_ARM_NEON; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# endif +# endif +# endif + return flags; +} + +#elif defined(__linux__) +/* Linux based */ +opus_uint32 opus_cpu_capa(void) +{ + opus_uint32 flags = 0; + FILE *cpuinfo; + + /* Reading /proc/self/auxv would be easier, but that doesn't work reliably on + * Android */ + cpuinfo = fopen("/proc/cpuinfo", "r"); + + if(cpuinfo != NULL) + { + /* 512 should be enough for anybody (it's even enough for all the flags that + * x86 has accumulated... so far). */ + char buf[512]; + + while(fgets(buf, 512, cpuinfo) != NULL) + { + /* Search for edsp and neon flag */ + if(memcmp(buf, "Features", 8) == 0) + { + char *p; + p = strstr(buf, " edsp"); + if(p != NULL && (p[5] == ' ' || p[5] == '\n')) + flags |= OPUS_CPU_ARM_EDSP; + + p = strstr(buf, " neon"); + if(p != NULL && (p[5] == ' ' || p[5] == '\n')) + flags |= OPUS_CPU_ARM_NEON; + } + + /* Search for media capabilities (>= ARMv6) */ + if(memcmp(buf, "CPU architecture:", 17) == 0) + { + int version; + version = atoi(buf+17); + + if(version == 4) + flags |= OPUS_CPU_ARM_V4; [TBT] The Windows code isn't detecting this, and I don't think we should, either. We should just assume it's true. 
+ + if(version >= 6) + flags |= OPUS_CPU_ARM_MEDIA; + } + } + + fclose(cpuinfo); + } + return flags; +} +#else +/* The feature registers which can tell us what the processor supports are + * accessible in priveleged modes only, so we can't have a general user-space + * detection method like on x86.*/ +# error "Configured to use ARM asm but no CPU detection method available for " \ + "your platform. Reconfigure with --disable-asm (or send patches)." +#endif + [TBT] This flags -> arch conversion should work in the reverse order. I.e., we should check the EDSP, MEDIA, and NEON flags in that order, and the first one we don't have, we should stop and return the prior one as our arch index. +int opus_select_arch(void) +{ + opus_uint32 flags = opus_cpu_capa(); + + if(flags & OPUS_CPU_ARM_NEON) + return 4; + else if(flags & OPUS_CPU_ARM_MEDIA) + return 3; + else if(flags & OPUS_CPU_ARM_EDSP) + return 2; + else if(flags & OPUS_CPU_ARM_V4) + return 1; [TBT] Don't need to test for ARMv4. + else + return 0; +}
On Thu, May 23, 2013 at 1:38 PM, Aurélien Zanelli <aurelien.zanelli at parrot.com> wrote: > I wrote a proof of concept regarding the cpu capabilities runtime detection > and choice of optimized function. I follow design which had been discussed > on IRC. The pixman library [1] used by X has code for run-time detection and fast-path optimisations for ARMv5/ARMv7/NEON and even iwMMXt (some Marvell platforms), if you want a sample implementation of one way of doing it. [1] http://cgit.freedesktop.org/pixman/tree/ > Also, i notice a little drawback: we must propagate the arch index through > functions which don't have codec state as argument. > > However, if it's look good, i will continue to implement it. Peter