Hi Team,
Can anyone take a look? We are already using this optimization in
production.
Best regards,
Zheng
On Thu, 15 Sept 2022 at 19:04, Zheng Lv <lvzheng at google.com> wrote:
> This makes kiss_twiddle_cpx 4-byte aligned (instead of 2-byte) for
> fixed-point builds. Tested with an armv6j+nofp development board, CELT
> encoding becomes 1.4x as fast, and decoding over 2x.
>
> Performance gain is mostly attributed to the proper alignment of the
> static const array mdct_twiddles960.
>
> Co-authored-by: David Gao <davidgao at google.com>
> ---
> celt/kiss_fft.h | 12 +++++++++++-
> 1 file changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
> index bffa2bfa..267f72f9 100644
> --- a/celt/kiss_fft.h
> +++ b/celt/kiss_fft.h
> @@ -49,31 +49,41 @@ extern "C" {
> #ifdef FIXED_POINT
> #include "arch.h"
>
> # define kiss_fft_scalar opus_int32
> # define kiss_twiddle_scalar opus_int16
>
> +/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory
> + * access, and could benefit from additional alignment.
> + */
> +# define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32))
>
> #else
> # ifndef kiss_fft_scalar
> /* default is float */
> # define kiss_fft_scalar float
> # define kiss_twiddle_scalar float
> # define KF_SUFFIX _celt_single
> # endif
> #endif
>
> +#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT)
> +#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT)))
> +#else
> +#define KISS_TWIDDLE_CPX_ALIGNED
> +#endif
> +
> typedef struct {
> kiss_fft_scalar r;
> kiss_fft_scalar i;
> }kiss_fft_cpx;
>
> typedef struct {
> kiss_twiddle_scalar r;
> kiss_twiddle_scalar i;
> -}kiss_twiddle_cpx;
> +} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx;
>
> #define MAXFACTORS 8
> /* e.g. an fft of length 128 has 4 factors
> as far as kissfft is concerned
> 4*4*4*2
> */
> --
> 2.37.2.789.g6183377224-goog
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.xiph.org/pipermail/opus/attachments/20221024/5e831677/attachment.htm>