Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 10/18] Clean up some intrinsics-related wording in configure.
--- configure.ac | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index f52d2c2..e1a6e9b 100644 --- a/configure.ac +++ b/configure.ac @@ -190,7 +190,7 @@ AC_ARG_ENABLE([rtcd], [enable_rtcd=yes]) AC_ARG_ENABLE([intrinsics], - [AS_HELP_STRING([--disable-intrinsics], [Disable intrinsics optimizations for ARM(float) X86(fixed)])],, + [AS_HELP_STRING([--disable-intrinsics], [Disable intrinsics optimizations])],, [enable_intrinsics=yes]) rtcd_support=no @@ -483,11 +483,11 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"], [ - AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics]) + AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, [Compiler supports ARMv7/Aarch64 Neon Intrinsics]) intrinsics_support="$intrinsics_support (Neon_Intrinsics)" AS_IF([test x"enable_rtcd" != x"" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], - [rtcd_support="$rtcd_support (ARMv7_Neon_Intrinsics)"]) + [rtcd_support="$rtcd_support (Neon_Intrinsics)"]) AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"], [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1, [Define if binary requires NEON intrinsics support])]) -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 11/18] Move OPUS_FAST_INT64 definition to celt/arch.h.
--- celt/arch.h | 5 +++++ silk/macros.h | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/celt/arch.h b/celt/arch.h index 9f74ddd..670527b 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -78,6 +78,11 @@ static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line) #define UADD32(a,b) ((a)+(b)) #define USUB32(a,b) ((a)-(b)) +/* Set this if opus_int64 is a native type of the CPU. */ +/* Assume that all LP64 architectures have fast 64-bit types; also x86_64 (which can be ILP32 for x32) + and Win64 (which is LLP64). */ +#define OPUS_FAST_INT64 (defined(__LP64__) || defined(__x86_64__) || defined(_WIN64)) + #define PRINT_MIPS(file) #ifdef FIXED_POINT diff --git a/silk/macros.h b/silk/macros.h index bc30303..1ba614a 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -34,6 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "opus_types.h" #include "opus_defines.h" +#include "arch.h" #if OPUS_GNUC_PREREQ(3, 0) #define opus_likely(x) (__builtin_expect(!!(x), 1)) @@ -43,9 +44,6 @@ POSSIBILITY OF SUCH DAMAGE. #define opus_unlikely(x) (!!(x)) #endif -/* Set this if opus_int64 is a native type of the CPU. */ -#define OPUS_FAST_INT64 (defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)) - /* This is an OPUS_INLINE header file for general platform. */ /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 12/18] Add OPUS_FAST_INT64 flavors of celt/fixed_generic.h macros.
--- celt/fixed_generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index ac67d37..1cfd6d6 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -37,16 +37,32 @@ #define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b)) /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),16)) +#else #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) +#endif /** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT16_32_P16(a,b) ((opus_val32)PSHR((opus_int64)((opus_val16)(a))*(b),16)) +#else #define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) +#endif /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT16_32_Q15(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),15)) +#else #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) +#endif /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31)) +#else #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15)) +#endif /** Compile-time conversion of float constant to 16-bit value */ #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits)))) -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 13/18] Explicitly cast results of silk OPUS_FAST_INT64 macros back to opus_int32.
--- silk/macros.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/silk/macros.h b/silk/macros.h index 1ba614a..e1e05b9 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -48,14 +48,14 @@ POSSIBILITY OF SUCH DAMAGE. /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ #if OPUS_FAST_INT64 -#define silk_SMULWB(a32, b32) (((a32) * (opus_int64)((opus_int16)(b32))) >> 16) +#define silk_SMULWB(a32, b32) ((opus_int32)(((a32) * (opus_int64)((opus_int16)(b32))) >> 16)) #else #define silk_SMULWB(a32, b32) ((((a32) >> 16) * (opus_int32)((opus_int16)(b32))) + ((((a32) & 0x0000FFFF) * (opus_int32)((opus_int16)(b32))) >> 16)) #endif /* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16 output have to be 32bit int */ #if OPUS_FAST_INT64 -#define silk_SMLAWB(a32, b32, c32) ((a32) + (((b32) * (opus_int64)((opus_int16)(c32))) >> 16)) +#define silk_SMLAWB(a32, b32, c32) ((opus_int32)((a32) + (((b32) * (opus_int64)((opus_int16)(c32))) >> 16))) #else #define silk_SMLAWB(a32, b32, c32) ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16))) #endif @@ -65,7 +65,7 @@ POSSIBILITY OF SUCH DAMAGE. /* a32 + (b32 * (c32 >> 16)) >> 16 */ #if OPUS_FAST_INT64 -#define silk_SMLAWT(a32, b32, c32) ((a32) + (((b32) * ((opus_int64)(c32) >> 16)) >> 16)) +#define silk_SMLAWT(a32, b32, c32) ((opus_int32)((a32) + (((b32) * ((opus_int64)(c32) >> 16)) >> 16))) #else #define silk_SMLAWT(a32, b32, c32) ((a32) + (((b32) >> 16) * ((c32) >> 16)) + ((((b32) & 0x0000FFFF) * ((c32) >> 16)) >> 16)) #endif @@ -87,14 +87,14 @@ POSSIBILITY OF SUCH DAMAGE. /* (a32 * b32) >> 16 */ #if OPUS_FAST_INT64 -#define silk_SMULWW(a32, b32) (((opus_int64)(a32) * (b32)) >> 16) +#define silk_SMULWW(a32, b32) ((opus_int32)(((opus_int64)(a32) * (b32)) >> 16)) #else #define silk_SMULWW(a32, b32) silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16)) #endif /* a32 + ((b32 * c32) >> 16) */ #if OPUS_FAST_INT64 -#define silk_SMLAWW(a32, b32, c32) ((a32) + (((opus_int64)(b32) * (c32)) >> 16)) +#define silk_SMLAWW(a32, b32, c32) ((opus_int32)((a32) + (((opus_int64)(b32) * (c32)) >> 16))) #else #define silk_SMLAWW(a32, b32, c32) silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16)) #endif -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 14/18] Add OPUS_FAST_INT64 definition of silk_SMULWT.
--- silk/macros.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/silk/macros.h b/silk/macros.h index e1e05b9..7cefedc 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -61,7 +61,11 @@ POSSIBILITY OF SUCH DAMAGE. #endif /* (a32 * (b32 >> 16)) >> 16 */ +#if OPUS_FAST_INT64 +#define silk_SMULWT(a32, b32) ((opus_int32)(((a32) * (opus_int64)((b32) >> 16)) >> 16)) +#else #define silk_SMULWT(a32, b32) (((a32) >> 16) * ((b32) >> 16) + ((((a32) & 0x0000FFFF) * ((b32) >> 16)) >> 16)) +#endif /* a32 + (b32 * (c32 >> 16)) >> 16 */ #if OPUS_FAST_INT64 -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 15/18] Clean up formatting of configure output for ARM intrinsics detection.
This makes it match the formatting of the output for ARM assembly better, and removes some redundant repetition of the word "intrinsics". It also fixes the output if a compiler supports RTCD for Neon intrinsics but not assembly. --- configure.ac | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index e1a6e9b..90a06c8 100644 --- a/configure.ac +++ b/configure.ac @@ -484,10 +484,12 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"], [ AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, [Compiler supports ARMv7/Aarch64 Neon Intrinsics]) - intrinsics_support="$intrinsics_support (Neon_Intrinsics)" + intrinsics_support="$intrinsics_support (NEON)" - AS_IF([test x"enable_rtcd" != x"" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], - [rtcd_support="$rtcd_support (Neon_Intrinsics)"]) + AS_IF([test x"$enable_rtcd" != x"" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"], + [AS_IF([test x"$rtcd_support" = "no"], + [rtcd_support="ARM (NEON Intrinsics)"], + [rtcd_support="$rtcd_support (NEON Intrinsics)"])]) AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"], [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1, [Define if binary requires NEON intrinsics support])]) @@ -501,12 +503,9 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ [rtcd_support="$rtcd_support (NE10)"]) ]) - AS_IF([test x"$rtcd_support" = x""], - [rtcd_support=no]) - AS_IF([test x"$intrinsics_support" = x""], [intrinsics_support=no], - [intrinsics_support="arm$intrinsics_support"]) + [intrinsics_support="ARM$intrinsics_support"]) ], [ AC_MSG_WARN([Compiler does not support ARM intrinsics]) -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 16/18] Add configure check for Aarch64-specific Neon intrinsics.
--- configure.ac | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/configure.ac b/configure.ac index 90a06c8..adcb969 100644 --- a/configure.ac +++ b/configure.ac @@ -503,6 +503,26 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ [rtcd_support="$rtcd_support (NE10)"]) ]) + OPUS_CHECK_INTRINSICS( + [Aarch64 Neon], + [$ARM_NEON_INTR_CFLAGS], + [OPUS_ARM_MAY_HAVE_AARCH64_NEON_INTR], + [OPUS_ARM_PRESUME_AARCH64_NEON_INTR], + [[#include <arm_neon.h> + ]], + [[ + static int32_t IN; + static int16_t OUT; + OUT = vqmovns_s32(IN); + ]] + ) + + AS_IF([test x"$OPUS_ARM_PRESUME_AARCH64_NEON_INTR" = x"1"], + [ + AC_DEFINE([OPUS_ARM_PRESUME_AARCH64_NEON_INTR], 1, [Define if binary requires Aarch64 Neon Intrinsics]) + intrinsics_support="$intrinsics_support (NEON [Aarch64])" + ]) + AS_IF([test x"$intrinsics_support" = x""], [intrinsics_support=no], [intrinsics_support="ARM$intrinsics_support"]) -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 17/18] Add Aarch64 intrinsics for saturated add/subtract.
--- silk/arm/macros_arm64.h | 39 +++++++++++++++++++++++++++++++++++++++ silk/macros.h | 4 ++++ silk_headers.mk | 1 + 3 files changed, 44 insertions(+) create mode 100644 silk/arm/macros_arm64.h diff --git a/silk/arm/macros_arm64.h b/silk/arm/macros_arm64.h new file mode 100644 index 0000000..ed03041 --- /dev/null +++ b/silk/arm/macros_arm64.h @@ -0,0 +1,39 @@ +/*********************************************************************** +Copyright (C) 2015 Vidyo +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_MACROS_ARM64_H +#define SILK_MACROS_ARM64_H + +#include <arm_neon.h> + +#undef silk_ADD_SAT32 +#define silk_ADD_SAT32(a, b) (vqadds_s32((a), (b))) + +#undef silk_SUB_SAT32 +#define silk_SUB_SAT32(a, b) (vqsubs_s32((a), (b))) + +#endif /* SILK_MACROS_ARM64_H */ diff --git a/silk/macros.h b/silk/macros.h index 7cefedc..d3ca347 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -151,5 +151,9 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32) #include "arm/macros_armv5e.h" #endif +#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR +#include "arm/macros_arm64.h" +#endif + #endif /* SILK_MACROS_H */ diff --git a/silk_headers.mk b/silk_headers.mk index c74ab81..f8bf1d2 100644 --- a/silk_headers.mk +++ b/silk_headers.mk @@ -24,6 +24,7 @@ silk/SigProc_FIX.h \ silk/x86/SigProc_FIX_sse.h \ silk/arm/macros_armv4.h \ silk/arm/macros_armv5e.h \ +silk/arm/macros_arm64.h \ silk/arm/SigProc_FIX_armv4.h \ silk/arm/SigProc_FIX_armv5e.h \ silk/arm/NSQ_neon.h \ -- 2.4.9 (Apple Git-60)
Jonathan Lennox
2015-Nov-21 04:03 UTC
[opus] [Aarch64 v2 18/18] Add Aarch64 intrinsic for SIG2WORD16.
--- celt/arch.h | 4 +++- celt/arm/fixed_arm64.h | 35 +++++++++++++++++++++++++++++++++++ celt_headers.mk | 1 + 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 celt/arm/fixed_arm64.h diff --git a/celt/arch.h b/celt/arch.h index 670527b..9a06359 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -123,7 +123,9 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) { #include "fixed_generic.h" -#ifdef OPUS_ARM_INLINE_EDSP +#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR +#include "arm/fixed_arm64.h" +#elif OPUS_ARM_INLINE_EDSP #include "arm/fixed_armv5e.h" #elif defined (OPUS_ARM_INLINE_ASM) #include "arm/fixed_armv4.h" diff --git a/celt/arm/fixed_arm64.h b/celt/arm/fixed_arm64.h new file mode 100644 index 0000000..c6fbd3d --- /dev/null +++ b/celt/arm/fixed_arm64.h @@ -0,0 +1,35 @@ +/* Copyright (C) 2015 Vidyo */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef FIXED_ARM64_H +#define FIXED_ARM64_H + +#include <arm_neon.h> + +#undef SIG2WORD16 +#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT))) + +#endif diff --git a/celt_headers.mk b/celt_headers.mk index 0eca6e6..c9df94b 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -36,6 +36,7 @@ celt/static_modes_fixed_arm_ne10.h \ celt/arm/armcpu.h \ celt/arm/fixed_armv4.h \ celt/arm/fixed_armv5e.h \ +celt/arm/fixed_arm64.h \ celt/arm/kiss_fft_armv4.h \ celt/arm/kiss_fft_armv5e.h \ celt/arm/pitch_arm.h \ -- 2.4.9 (Apple Git-60)
Possibly Parallel Threads
- [Fast Int64 1/4] Move OPUS_FAST_INT64 definition to celt/arch.h.
- [PATCH 1/3] Add configure check for Aarch64-specific Neon intrinsics.
- [Aarch64 00/11] Patches to enable Aarch64 (arm64) optimizations, rebased to current master.
- [PATCH 0/8] Patches for arm64 (aarch64) support
- Patch cleaning up Opus x86 intrinsics configury