Viswanath Puttagunta
2014-Sep-10 19:15 UTC
[Vorbis-dev] [RFC PATCH v1 0/3] Introducing ARM SIMD Support
libvorbis does not currently have any SIMD vectorization. The following patches add a generic framework for SIMD vectorization and, on top of it, an ARM NEON implementation using intrinsics. I was able to get a performance improvement of over 34% on my BeagleBone Black, which has a single Cortex-A8 CPU. You can find more information on the metrics and the procedure I used to measure them at https://wiki.linaro.org/WorkingGroups/Middleware/Graphics/MediaLibs/libvorbis As described in the document linked above, you can see my work-in-progress tree at https://git.linaro.org/people/viswanath.puttagunta/vorbis.git Also, what standard should one follow when submitting patches to libvorbis? I'm pretty sure the Linux kernel's checkpatch.pl will fail all patches for libvorbis. Please advise. Viswanath Puttagunta (3): lib/simd: Introduce vectorization framework for libvorbis lib/block.c: Use optimized routine for wave_operation mdct: implement arm simd implementation for mdct configure.ac | 5 ++ lib/Makefile.am | 6 +- lib/block.c | 14 ++--- lib/mdct.c | 102 +------------------------------- lib/simd/Makefile.am | 9 +++ lib/simd/neon_simd.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/simd/no_simd.c | 129 +++++++++++++++++++++++++++++++++++++++++ lib/simd/simd.h | 40 +++++++++++++ 8 files changed, 351 insertions(+), 112 deletions(-) create mode 100644 lib/simd/Makefile.am create mode 100644 lib/simd/neon_simd.c create mode 100644 lib/simd/no_simd.c create mode 100644 lib/simd/simd.h -- 1.7.9.5
Viswanath Puttagunta
2014-Sep-10 19:15 UTC
[Vorbis-dev] [RFC PATCH v1 1/3] lib/simd: Introduce vectorization framework for libvorbis
Many CPUs have vectorization support that can greatly increase the performance of certain signal processing functions. Introduces generic vectorization framework into libvorbis and some ARM NEON implementations. Signed-off-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org> --- configure.ac | 5 +++++ lib/Makefile.am | 6 +++--- lib/simd/Makefile.am | 9 +++++++++ lib/simd/neon_simd.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ lib/simd/no_simd.c | 28 ++++++++++++++++++++++++++++ lib/simd/simd.h | 29 +++++++++++++++++++++++++++++ 6 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 lib/simd/Makefile.am create mode 100644 lib/simd/neon_simd.c create mode 100644 lib/simd/no_simd.c create mode 100644 lib/simd/simd.h diff --git a/configure.ac b/configure.ac index ee2db99..28b4aaa 100644 --- a/configure.ac +++ b/configure.ac @@ -93,6 +93,10 @@ AC_ARG_ENABLE(examples, AM_CONDITIONAL(BUILD_EXAMPLES, [test "x$enable_examples" = xyes]) +AC_ARG_ENABLE([arm-neon], + [enables arm_neon],[arm_neon=${enableval}],[arm_neon=no]) +AM_CONDITIONAL([ARM_NEON], [test x$arm_neon = xyes]) + dnl -------------------------------------------------- dnl Set build flags based on environment dnl -------------------------------------------------- @@ -275,6 +279,7 @@ AC_CONFIG_FILES([ Makefile m4/Makefile lib/Makefile +lib/simd/Makefile lib/modes/Makefile lib/books/Makefile lib/books/coupled/Makefile diff --git a/lib/Makefile.am b/lib/Makefile.am index 50f7ea4..0727398 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,8 +1,8 @@ ## Process this file with automake to produce Makefile.in -SUBDIRS = modes books +SUBDIRS = simd modes books -INCLUDES = -I$(top_srcdir)/include @OGG_CFLAGS@ +INCLUDES = -I$(top_srcdir)/include @OGG_CFLAGS@ -I$(top_srcdir)/lib/simd lib_LTLIBRARIES = libvorbis.la libvorbisfile.la libvorbisenc.la @@ -16,7 +16,7 @@ libvorbis_la_SOURCES = mdct.c smallft.c block.c envelope.c window.c lsp.c \ registry.h scales.h window.h lookup.h 
lookup_data.h\ codec_internal.h backends.h bitrate.h libvorbis_la_LDFLAGS = -no-undefined -version-info @V_LIB_CURRENT@:@V_LIB_REVISION@:@V_LIB_AGE@ -libvorbis_la_LIBADD = @VORBIS_LIBS@ @OGG_LIBS@ +libvorbis_la_LIBADD = @VORBIS_LIBS@ @OGG_LIBS@ ./simd/libsimd.la libvorbisfile_la_SOURCES = vorbisfile.c libvorbisfile_la_LDFLAGS = -no-undefined -version-info @VF_LIB_CURRENT@:@VF_LIB_REVISION@:@VF_LIB_AGE@ diff --git a/lib/simd/Makefile.am b/lib/simd/Makefile.am new file mode 100644 index 0000000..7225431 --- /dev/null +++ b/lib/simd/Makefile.am @@ -0,0 +1,9 @@ +INCLUDES = -I$(top_srcdir)/include -I$(top_srcdir)/lib @OGG_CFLAGS@ +noinst_LTLIBRARIES = libsimd.la + +if ARM_NEON +libsimd_la_CPPFLAGS = -mfpu=neon-vfpv4 -O3 +libsimd_la_SOURCES = simd.h neon_simd.c +else +libsimd_la_SOURCES = simd.h no_simd.c +endif diff --git a/lib/simd/neon_simd.c b/lib/simd/neon_simd.c new file mode 100644 index 0000000..381d704 --- /dev/null +++ b/lib/simd/neon_simd.c @@ -0,0 +1,47 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ********************************************************************/ + +/* Optimized functions using ARM NEON */ + +#include <stdint.h> +#include <arm_neon.h> +#include "simd.h" + +/* wave_operation: Implements pseudo-code + * for(i = 0; i < n; i++) + * a[i] = a[i]*b[n-i-1] + a[i]*c[i] + * Assumes n is multiple of 4 + */ +void wave_operation(float *a, float *b, float *c, int32_t n) { + float32x4_t result, tmpa, tmpb, tmpc, tmpd; + float *ai, *bi, *ci, *di; + float32x2_t vec64l, vec64h; + + for (ai = a, bi = b+n-4, ci=c, di=b; \ + ai < (a+n); ai += 4, bi-=4, ci+=4, di+=4) { + tmpa = vld1q_f32(ai); + tmpb = vld1q_f32(bi); + __builtin_prefetch(ai); + vec64l = vget_low_f32(tmpb); + vec64h = vget_high_f32(tmpb); + tmpb = vcombine_f32(vec64h, vec64l); + tmpb = vrev64q_f32(tmpb); + tmpc = vld1q_f32(ci); + tmpd = vld1q_f32(di); + __builtin_prefetch(ci); + + result = vmulq_f32(tmpa, tmpb); + result = vmlaq_f32(result, tmpc, tmpd); + + vst1q_f32(ai, result); + } +} diff --git a/lib/simd/no_simd.c b/lib/simd/no_simd.c new file mode 100644 index 0000000..e8efacb --- /dev/null +++ b/lib/simd/no_simd.c @@ -0,0 +1,28 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ********************************************************************/ + +/* Implementation when CPU Vectorization is not available */ + +#include <stdint.h> +#include "simd.h" +#include "os.h" + +/* wave_operation: Implements pseudo-code + * for(i = 0; i < n; i++) + * a[i] = a[i]*b[n-i-1] + a[i]*c[i] + */ +void wave_operation(float *a, float *b, float *c, int32_t n) { + int32_t i; + + for (i = 0; i < n; i++) + a[i] = a[i]*b[n-i-1] + b[i]*c[i]; +} diff --git a/lib/simd/simd.h b/lib/simd/simd.h new file mode 100644 index 0000000..8565434 --- /dev/null +++ b/lib/simd/simd.h @@ -0,0 +1,29 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: Declarations for functions that can be optimized with + CPU vectorization if available + ********************************************************************/ + +#ifndef SIMD_H +#define SIMD_H + +#include "os.h" + +/* wave_operation: Must implement pseudo-code + * for(i = 0; i < n; i++) + * a[i] = a[i]*b[n-i-1] + b[i]*c[i] + * n must be multiple of 4 + */ +void wave_operation(float *a, float *b, float *c, int32_t n); + +#endif -- 1.7.9.5
Viswanath Puttagunta
2014-Sep-10 19:15 UTC
[Vorbis-dev] [RFC PATCH v1 2/3] lib/block.c: Use optimized routine for wave_operation
Signed-off-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org> --- lib/block.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/block.c b/lib/block.c index dfcd843..0c3f781 100644 --- a/lib/block.c +++ b/lib/block.c @@ -30,6 +30,7 @@ #include "lpc.h" #include "registry.h" #include "misc.h" +#include "simd.h" static int ilog2(unsigned int v){ int ret=0; @@ -774,15 +775,13 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ const float *w=_vorbis_window_get(b->window[1]-hs); float *pcm=v->pcm[j]+prevCenter; float *p=vb->pcm[j]; - for(i=0;i<n1;i++) - pcm[i]=pcm[i]*w[n1-i-1] + p[i]*w[i]; + wave_operation(&pcm[0], &w[0], &p[0], n1); }else{ /* large/small */ const float *w=_vorbis_window_get(b->window[0]-hs); float *pcm=v->pcm[j]+prevCenter+n1/2-n0/2; float *p=vb->pcm[j]; - for(i=0;i<n0;i++) - pcm[i]=pcm[i]*w[n0-i-1] +p[i]*w[i]; + wave_operation(&pcm[0], &w[0], &p[0], n0); } }else{ if(v->W){ @@ -790,8 +789,8 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ const float *w=_vorbis_window_get(b->window[0]-hs); float *pcm=v->pcm[j]+prevCenter; float *p=vb->pcm[j]+n1/2-n0/2; - for(i=0;i<n0;i++) - pcm[i]=pcm[i]*w[n0-i-1] +p[i]*w[i]; + wave_operation(&pcm[0], &w[0], &p[0], n0); + i = n0; for(;i<n1/2+n0/2;i++) pcm[i]=p[i]; }else{ @@ -799,8 +798,7 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ const float *w=_vorbis_window_get(b->window[0]-hs); float *pcm=v->pcm[j]+prevCenter; float *p=vb->pcm[j]; - for(i=0;i<n0;i++) - pcm[i]=pcm[i]*w[n0-i-1] +p[i]*w[i]; + wave_operation(&pcm[0], &w[0], &p[0], n0); } } -- 1.7.9.5
Viswanath Puttagunta
2014-Sep-10 19:15 UTC
[Vorbis-dev] [RFC PATCH v1 3/3] mdct: implement arm simd implementation for mdct
Signed-off-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org> --- lib/mdct.c | 102 +--------------------------------------------- lib/simd/neon_simd.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/simd/no_simd.c | 101 +++++++++++++++++++++++++++++++++++++++++++++ lib/simd/simd.h | 11 +++++ 4 files changed, 224 insertions(+), 101 deletions(-) diff --git a/lib/mdct.c b/lib/mdct.c index fbc7cf0..633fc49 100644 --- a/lib/mdct.c +++ b/lib/mdct.c @@ -45,6 +45,7 @@ #include "mdct.h" #include "os.h" #include "misc.h" +#include "simd.h" /* build lookups for trig functions; also pre-figure scaling and some window function algebra. */ @@ -213,107 +214,6 @@ STIN void mdct_butterfly_32(DATA_TYPE *x){ } -/* N point first stage butterfly (in place, 2 register) */ -STIN void mdct_butterfly_first(DATA_TYPE *T, - DATA_TYPE *x, - int points){ - - DATA_TYPE *x1 = x + points - 8; - DATA_TYPE *x2 = x + (points>>1) - 8; - REG_TYPE r0; - REG_TYPE r1; - - do{ - - r0 = x1[6] - x2[6]; - r1 = x1[7] - x2[7]; - x1[6] += x2[6]; - x1[7] += x2[7]; - x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]); - x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]); - - r0 = x1[4] - x2[4]; - r1 = x1[5] - x2[5]; - x1[4] += x2[4]; - x1[5] += x2[5]; - x2[4] = MULT_NORM(r1 * T[5] + r0 * T[4]); - x2[5] = MULT_NORM(r1 * T[4] - r0 * T[5]); - - r0 = x1[2] - x2[2]; - r1 = x1[3] - x2[3]; - x1[2] += x2[2]; - x1[3] += x2[3]; - x2[2] = MULT_NORM(r1 * T[9] + r0 * T[8]); - x2[3] = MULT_NORM(r1 * T[8] - r0 * T[9]); - - r0 = x1[0] - x2[0]; - r1 = x1[1] - x2[1]; - x1[0] += x2[0]; - x1[1] += x2[1]; - x2[0] = MULT_NORM(r1 * T[13] + r0 * T[12]); - x2[1] = MULT_NORM(r1 * T[12] - r0 * T[13]); - - x1-=8; - x2-=8; - T+=16; - - }while(x2>=x); -} - -/* N/stage point generic N stage butterfly (in place, 2 register) */ -STIN void mdct_butterfly_generic(DATA_TYPE *T, - DATA_TYPE *x, - int points, - int trigint){ - - DATA_TYPE *x1 = x + points - 8; - DATA_TYPE *x2 = x + (points>>1) - 8; - REG_TYPE r0; - REG_TYPE r1; - - do{ - - r0 = 
x1[6] - x2[6]; - r1 = x1[7] - x2[7]; - x1[6] += x2[6]; - x1[7] += x2[7]; - x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]); - x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]); - - T+=trigint; - - r0 = x1[4] - x2[4]; - r1 = x1[5] - x2[5]; - x1[4] += x2[4]; - x1[5] += x2[5]; - x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]); - x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]); - - T+=trigint; - - r0 = x1[2] - x2[2]; - r1 = x1[3] - x2[3]; - x1[2] += x2[2]; - x1[3] += x2[3]; - x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]); - x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]); - - T+=trigint; - - r0 = x1[0] - x2[0]; - r1 = x1[1] - x2[1]; - x1[0] += x2[0]; - x1[1] += x2[1]; - x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]); - x2[1] = MULT_NORM(r1 * T[0] - r0 * T[1]); - - T+=trigint; - x1-=8; - x2-=8; - - }while(x2>=x); -} - STIN void mdct_butterflies(mdct_lookup *init, DATA_TYPE *x, int points){ diff --git a/lib/simd/neon_simd.c b/lib/simd/neon_simd.c index 381d704..ab55205 100644 --- a/lib/simd/neon_simd.c +++ b/lib/simd/neon_simd.c @@ -16,6 +16,45 @@ #include <arm_neon.h> #include "simd.h" +#ifdef MDCT_INTEGERIZED + +#define DATA_TYPE32x4x2_t int32x4x2_t +#define DATA_TYPE32x4_t int32x4_t +#define DATA_TYPE32x2_t int32x2_t +#define ONES_MINUS_ONE 0x00000001FFFFFFFF // {1, -1} +#define vcreate_DATA_TYPE(a) vcreate_i32(a) +#define vcombine_DATA_TYPE(a,b) vcombine_i32(a,b) +#define vld2q_DATA_TYPE(a) vld2q_i32(a) +#define vsubq_DATA_TYPE(a,b) vsubq_i32(a,b) +#define vaddq_DATA_TYPE(a,b) vaddq_i32(a,b) +#define vst2q_DATA_TYPE(a,b) vst2q_i32(a,b) +#define vld1_DATA_TYPE(a) vld1_i32(a) +#define vrev64_DATA_TYPE(a) vrev64_i32(a) +#define vmul_n_DATA_TYPE(a,b) vmul_n_i32(a,b) +#define vmulq_DATA_TYPE(a,b) vmulq_i32(a,b) +#define vst1q_DATA_TYPE(a,b) vst1q_i32(a,b) + +#else + +#define DATA_TYPE32x4x2_t float32x4x2_t +#define DATA_TYPE32x4_t float32x4_t +#define DATA_TYPE32x2_t float32x2_t +#define ONES_MINUS_ONE 0xbf8000003f800000 //{-1.0, 1.0} +#define vcreate_DATA_TYPE(a) vcreate_f32(a) +#define vcombine_DATA_TYPE(a,b) 
vcombine_f32(a,b) +#define vld2q_DATA_TYPE(a) vld2q_f32(a) +#define vsubq_DATA_TYPE(a,b) vsubq_f32(a,b) +#define vaddq_DATA_TYPE(a,b) vaddq_f32(a,b) +#define vst2q_DATA_TYPE(a,b) vst2q_f32(a,b) +#define vld1_DATA_TYPE(a) vld1_f32(a) +#define vrev64_DATA_TYPE(a) vrev64_f32(a) +#define vmul_n_DATA_TYPE(a,b) vmul_n_f32(a,b) +#define vmulq_DATA_TYPE(a,b) vmulq_f32(a,b) +#define vst1q_DATA_TYPE(a,b) vst1q_f32(a,b) +#define vrev64q_DATA_TYPE(a,b) vrev64q_f32(a,b) + +#endif + /* wave_operation: Implements pseudo-code * for(i = 0; i < n; i++) * a[i] = a[i]*b[n-i-1] + a[i]*c[i] @@ -45,3 +84,75 @@ void wave_operation(float *a, float *b, float *c, int32_t n) { vst1q_f32(ai, result); } } + +void mdct_butterfly_generic(DATA_TYPE *T, DATA_TYPE *x, + int points, int trigint) { + DATA_TYPE *x1 = x + points - 8; + DATA_TYPE *x2 = x + (points>>1) - 8; + DATA_TYPE32x4x2_t k1, k2; + DATA_TYPE32x4_t r0, r1; + DATA_TYPE32x4_t TTr0[2], TTir1[2]; + DATA_TYPE32x2_t Tr0[4], Tir1[4]; + DATA_TYPE32x2_t dones = vcreate_DATA_TYPE(ONES_MINUS_ONE); + DATA_TYPE32x4_t ones = vcombine_DATA_TYPE(dones, dones); + + do{ + k1 = vld2q_DATA_TYPE(x1); + k2 = vld2q_DATA_TYPE(x2); + + r0 = vsubq_DATA_TYPE(k1.val[0], k2.val[0]); + r1 = vsubq_DATA_TYPE(k1.val[1], k2.val[1]); + + k1.val[0] = vaddq_DATA_TYPE(k1.val[0], k2.val[0]); + k1.val[1] = vaddq_DATA_TYPE(k1.val[1], k2.val[1]); + vst2q_DATA_TYPE(x1, k1); + + Tr0[0] = vld1_DATA_TYPE(T); + T += trigint; + + Tr0[1] = vld1_DATA_TYPE(T); + T += trigint; + + Tr0[2] = vld1_DATA_TYPE(T); + T += trigint; + + Tr0[3] = vld1_DATA_TYPE(T); + T += trigint; + + Tir1[0] = vrev64_DATA_TYPE(Tr0[0]); + Tir1[1] = vrev64_DATA_TYPE(Tr0[1]); + Tir1[2] = vrev64_DATA_TYPE(Tr0[2]); + Tir1[3] = vrev64_DATA_TYPE(Tr0[3]); + + Tr0[0] = vmul_n_DATA_TYPE(Tr0[0], r0[3]); + Tr0[1] = vmul_n_DATA_TYPE(Tr0[1], r0[2]); + Tr0[2] = vmul_n_DATA_TYPE(Tr0[2], r0[1]); + Tr0[3] = vmul_n_DATA_TYPE(Tr0[3], r0[0]); + + Tir1[0] = vmul_n_DATA_TYPE(Tir1[0], r1[3]); + Tir1[1] = vmul_n_DATA_TYPE(Tir1[1], 
r1[2]); + Tir1[2] = vmul_n_DATA_TYPE(Tir1[2], r1[1]); + Tir1[3] = vmul_n_DATA_TYPE(Tir1[3], r1[0]); + + TTr0[0] = vcombine_DATA_TYPE(Tr0[3], Tr0[2]); + TTr0[0] = vmulq_DATA_TYPE(TTr0[0], ones); + TTr0[1] = vcombine_DATA_TYPE(Tr0[1], Tr0[0]); + TTr0[1] = vmulq_DATA_TYPE(TTr0[1], ones); + + TTir1[0] = vcombine_DATA_TYPE(Tir1[3], Tir1[2]); + TTir1[1] = vcombine_DATA_TYPE(Tir1[1], Tir1[0]); + k2.val[0] = vaddq_DATA_TYPE(TTr0[0], TTir1[0]); + k2.val[1] = vaddq_DATA_TYPE(TTr0[1], TTir1[1]); + + vst1q_DATA_TYPE(x2, k2.val[0]); + vst1q_DATA_TYPE(x2+4, k2.val[1]); + + x1 -= 8; + x2 -= 8; + }while(x2 >= x); +} + +void mdct_butterfly_first(DATA_TYPE *T, DATA_TYPE *x, int points) +{ + mdct_butterfly_generic(T, x, points, 4); +} diff --git a/lib/simd/no_simd.c b/lib/simd/no_simd.c index e8efacb..a2cc6c4 100644 --- a/lib/simd/no_simd.c +++ b/lib/simd/no_simd.c @@ -26,3 +26,104 @@ void wave_operation(float *a, float *b, float *c, int32_t n) { for (i = 0; i < n; i++) a[i] = a[i]*b[n-i-1] + b[i]*c[i]; } + +/* N/stage point generic N stage butterfly (in place, 2 register) */ +void mdct_butterfly_generic(DATA_TYPE *T, + DATA_TYPE *x, + int points, + int trigint){ + + DATA_TYPE *x1 = x + points - 8; + DATA_TYPE *x2 = x + (points>>1) - 8; + REG_TYPE r0; + REG_TYPE r1; + + do{ + + r0 = x1[6] - x2[6]; + r1 = x1[7] - x2[7]; + x1[6] += x2[6]; + x1[7] += x2[7]; + x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]); + + T+=trigint; + + r0 = x1[4] - x2[4]; + r1 = x1[5] - x2[5]; + x1[4] += x2[4]; + x1[5] += x2[5]; + x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]); + + T+=trigint; + + r0 = x1[2] - x2[2]; + r1 = x1[3] - x2[3]; + x1[2] += x2[2]; + x1[3] += x2[3]; + x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]); + + T+=trigint; + + r0 = x1[0] - x2[0]; + r1 = x1[1] - x2[1]; + x1[0] += x2[0]; + x1[1] += x2[1]; + x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[1] = MULT_NORM(r1 * T[0] - r0 * 
T[1]); + + T+=trigint; + x1-=8; + x2-=8; + + }while(x2>=x); +} + +/* N point first stage butterfly (in place, 2 register) */ +void mdct_butterfly_first(DATA_TYPE *T, + DATA_TYPE *x, + int points){ + + DATA_TYPE *x1 = x + points - 8; + DATA_TYPE *x2 = x + (points>>1) - 8; + REG_TYPE r0; + REG_TYPE r1; + + do{ + + r0 = x1[6] - x2[6]; + r1 = x1[7] - x2[7]; + x1[6] += x2[6]; + x1[7] += x2[7]; + x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]); + + r0 = x1[4] - x2[4]; + r1 = x1[5] - x2[5]; + x1[4] += x2[4]; + x1[5] += x2[5]; + x2[4] = MULT_NORM(r1 * T[5] + r0 * T[4]); + x2[5] = MULT_NORM(r1 * T[4] - r0 * T[5]); + + r0 = x1[2] - x2[2]; + r1 = x1[3] - x2[3]; + x1[2] += x2[2]; + x1[3] += x2[3]; + x2[2] = MULT_NORM(r1 * T[9] + r0 * T[8]); + x2[3] = MULT_NORM(r1 * T[8] - r0 * T[9]); + + r0 = x1[0] - x2[0]; + r1 = x1[1] - x2[1]; + x1[0] += x2[0]; + x1[1] += x2[1]; + x2[0] = MULT_NORM(r1 * T[13] + r0 * T[12]); + x2[1] = MULT_NORM(r1 * T[12] - r0 * T[13]); + + x1-=8; + x2-=8; + T+=16; + + }while(x2>=x); +} diff --git a/lib/simd/simd.h b/lib/simd/simd.h index 8565434..a9e6ccf 100644 --- a/lib/simd/simd.h +++ b/lib/simd/simd.h @@ -18,6 +18,7 @@ #define SIMD_H #include "os.h" +#include "mdct.h" /* wave_operation: Must implement pseudo-code * for(i = 0; i < n; i++) @@ -26,4 +27,14 @@ */ void wave_operation(float *a, float *b, float *c, int32_t n); +/* N/stage point generic N stage butterfly (in place, 2 register) */ +void mdct_butterfly_generic(DATA_TYPE *T, + DATA_TYPE *x, + int points, + int trigint); + +/* N point first stage butterfly (in place, 2 register) */ +void mdct_butterfly_first(DATA_TYPE *T, + DATA_TYPE *x, + int points); #endif -- 1.7.9.5
Viswanath Puttagunta
2014-Sep-23 20:50 UTC
[Vorbis-dev] [RFC PATCH v1 0/3] Introducing ARM SIMD Support
Hello Vorbis Developers / Monty, Am I at the right place? I posted these patches about 2 weeks ago to "vorbis-dev at xiph.org". I haven't seen any responses or reviews for these patches, nor do I see any other emails from this mailing list. Please advise. Regards, Vish (Viswanath Puttagunta) Cell: 972-342-0205 Technical Program Manager Member Services, Linaro On 10 September 2014 14:15, Viswanath Puttagunta <viswanath.puttagunta at linaro.org> wrote: > libvorbis does not currently have any simd/vectorization. > > Following patches add generic framework for simd/vectorization > and on top, add ARM-NEON simd vectorization using intrinsics. > > I was able to get over 34% performance improvement on my > Beaglebone Black which is single Cortex-A8 based CPU. > > You can find more information on metrics and procedure I used > to measure at > > https://wiki.linaro.org/WorkingGroups/Middleware/Graphics/MediaLibs/libvorbis > > As described in above link/doc, you can see my work-in-progress > tree at > https://git.linaro.org/people/viswanath.puttagunta/vorbis.git > > Also what standard should one follow when submitting patches > to libvorbis? I'm pretty sure linux kernel checkpatch.pl will > fail all patches for libvorbis. Please advise.
> > Viswanath Puttagunta (3): > lib/simd: Introduce vectorization framework for libvorbis > lib/block.c: Use optimized routine for wave_operation > mdct: implement arm simd implementation for mdct > > configure.ac | 5 ++ > lib/Makefile.am | 6 +- > lib/block.c | 14 ++--- > lib/mdct.c | 102 +------------------------------- > lib/simd/Makefile.am | 9 +++ > lib/simd/neon_simd.c | 158 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > lib/simd/no_simd.c | 129 +++++++++++++++++++++++++++++++++++++++++ > lib/simd/simd.h | 40 +++++++++++++ > 8 files changed, 351 insertions(+), 112 deletions(-) > create mode 100644 lib/simd/Makefile.am > create mode 100644 lib/simd/neon_simd.c > create mode 100644 lib/simd/no_simd.c > create mode 100644 lib/simd/simd.h > > -- > 1.7.9.5 > >-------------- next part -------------- An HTML attachment was scrubbed... URL: http://lists.xiph.org/pipermail/vorbis-dev/attachments/20140923/36174ae8/attachment.htm