Anthony Blake
2012-Jul-06 04:06 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
Hi,

I've noticed that LLVM tends to generate suboptimal code and spill an excessive amount of registers in large functions, such as in those that are automatically generated by FFTW.

LLVM generates good code for a function that computes an 8-point complex FFT, but from 16-point upwards, icc or gcc generates much better code. Here is an example of a sequence of instructions from a 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:

[...]
movaps 32(%rdi), %xmm3
movaps 48(%rdi), %xmm2
movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
addps %xmm0, %xmm1
movaps %xmm1, -16(%rbp) ## 16-byte Spill
movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
[...]

Here xmm3 is loaded, duplicated into two registers, and then discarded as other data is loaded into it. Can anyone shed some light on why this might be happening?

Below is the code for a 32-point FFT that generated the above instructions. Sorry I can't supply a smaller example, but this only begins to happen with functions of this size.

Regards,
Anthony

////////////////////// CODE /////////////////////////////////////////

#include <xmmintrin.h>

#define __INLINE static inline __attribute__((always_inline))

#define LOAD _mm_load_ps
#define STORE _mm_store_ps
#define ADD _mm_add_ps
#define SUB _mm_sub_ps
#define MULT _mm_mul_ps
#define STREAM _mm_stream_ps
#define SHUF _mm_shuffle_ps
#define VLIT4(a,b,c,d) _mm_set_ps(a,b,c,d)
#define SWAP(d) SHUF(d,d,_MM_SHUFFLE(2,3,0,1))
#define UNPACK2LO(a,b) SHUF(a,b,_MM_SHUFFLE(1,0,1,0))
#define UNPACK2HI(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,3,2))
#define HALFBLEND(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,1,0))

__INLINE void TX2(__m128 *a, __m128 *b) {
  __m128 TX2_t0 = UNPACK2LO(*a, *b);
  __m128 TX2_t1 = UNPACK2HI(*a, *b);
  *a = TX2_t0; *b = TX2_t1;
}

__INLINE void FMA(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = ADD(*Rd, MULT(Rn, Rm)); }
__INLINE void FMS(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = SUB(*Rd, MULT(Rn, Rm)); }

__INLINE __m128 SIGNIFY(__m128 a) {
  __m128 t1;
  t1 = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
  return t1;
}

__INLINE __m128 MULI(__m128 a) { return SWAP(SIGNIFY(a)); }

__INLINE __m128 MUL(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMS(&re, im, SWAP(d));
  return re;
}

__INLINE __m128 MULJ(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMA(&re, im, SWAP(d));
  return re;
}

__INLINE void S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3,
                  float *o0, float *o1, float *o2, float *o3) {
  STORE(o0, r0); STORE(o1, r1); STORE(o2, r2); STORE(o3, r3);
}

__INLINE void K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk, zk_d;
  uk = *r0; uk2 = *r1;
  zk   = ADD(*r2, *r3);
  zk_d = MULI(SUB(*r2, *r3));
  *r0 = ADD(uk, zk);
  *r2 = SUB(uk, zk);
  *r1 = SUB(uk2, zk_d);
  *r3 = ADD(uk2, zk_d);
}

__INLINE void K_N(__m128 re, __m128 im,
                  __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk_p, zk_n, zk, zk_d;
  uk = *r0; uk2 = *r1;
  zk_p = MUL(*r2, re, im);
  zk_n = MULJ(*r3, re, im);
  zk   = ADD(zk_p, zk_n);
  zk_d = MULI(SUB(zk_p, zk_n));
  *r2 = SUB(uk, zk);
  *r0 = ADD(uk, zk);
  *r3 = ADD(uk2, zk_d);
  *r1 = SUB(uk2, zk_d);
}

__INLINE void L_4_4(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t2 = LOAD(i2); t3 = LOAD(i3);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = MULI(SUB(t2, t3));
  t0 = ADD(t4, t6); t2 = SUB(t4, t6);
  t1 = SUB(t5, t7); t3 = ADD(t5, t7);
  TX2(&t0, &t1); TX2(&t2, &t3);
  *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}

__INLINE void L_2_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t2 = LOAD(i2); t3 = LOAD(i3);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = SUB(t2, t3);
  TX2(&t4, &t5); TX2(&t6, &t7);
  *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
}

__INLINE void L_4_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t6 = LOAD(i2); t7 = LOAD(i3);
  t2 = HALFBLEND(t6, t7); t3 = HALFBLEND(t7, t6);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = SUB(t2, t3);
  *r2 = UNPACK2HI(t4, t5); *r3 = UNPACK2HI(t6, t7);
  t7 = MULI(t7);
  t0 = ADD(t4, t6); t2 = SUB(t4, t6);
  t1 = SUB(t5, t7); t3 = ADD(t5, t7);
  *r0 = UNPACK2LO(t0, t1); *r1 = UNPACK2LO(t2, t3);
}

void fft32(const float *in, float *out) {
  __m128 r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15,
         r16_17, r18_19, r20_21, r22_23, r24_25, r26_27, r28_29, r30_31;

  L_4_4(in+0,  in+32, in+16, in+48, &r0_1,   &r2_3,   &r16_17, &r18_19);
  L_2_2(in+8,  in+40, in+56, in+24, &r4_5,   &r6_7,   &r20_21, &r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r0_1, &r2_3, &r4_5, &r6_7);
  L_4_2(in+4,  in+36, in+20, in+52, &r8_9,   &r10_11, &r28_29, &r30_31);
  L_4_4(in+60, in+28, in+12, in+44, &r12_13, &r14_15, &r24_25, &r26_27);
  K_N(VLIT4(0.9238,0.9238,1,1), VLIT4(0.3826,-0.3826,0,-0),
      &r0_1, &r4_5, &r8_9, &r12_13);
  K_N(VLIT4(0.3826,0.3826,0.7071,0.7071), VLIT4(0.9238,-0.9238,0.7071,-0.7071),
      &r2_3, &r6_7, &r10_11, &r14_15);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r16_17, &r18_19, &r20_21, &r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r24_25, &r26_27, &r28_29, &r30_31);
  K_N(VLIT4(0.9807,0.9807,1,1), VLIT4(0.1950,-0.1950,0,-0),
      &r0_1, &r8_9, &r16_17, &r24_25);
  S_4(r0_1, r8_9, r16_17, r24_25, out+0, out+16, out+32, out+48);
  K_N(VLIT4(0.8314,0.8314,0.9238,0.9238), VLIT4(0.5555,-0.5555,0.3826,-0.3826),
      &r2_3, &r10_11, &r18_19, &r26_27);
  S_4(r2_3, r10_11, r18_19, r26_27, out+4, out+20, out+36, out+52);
  K_N(VLIT4(0.5555,0.5555,0.7071,0.7071), VLIT4(0.8314,-0.8314,0.7071,-0.7071),
      &r4_5, &r12_13, &r20_21, &r28_29);
  S_4(r4_5, r12_13, r20_21, r28_29, out+8, out+24, out+40, out+56);
  K_N(VLIT4(0.1950,0.1950,0.3826,0.3826), VLIT4(0.9807,-0.9807,0.9238,-0.9238),
      &r6_7, &r14_15, &r22_23, &r30_31);
  S_4(r6_7, r14_15, r22_23, r30_31, out+12, out+28, out+44, out+60);
}
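
For reference, a minimal driver along the following lines (appended to the translation unit above) can be used to regenerate the assembly; the buffer initialisation, file name, and compile flags in the comment are my assumptions, not part of the original post:

/* Hypothetical harness for inspecting the generated code, e.g. with
 * something like "clang -O3 -S fft32.c" and then reading fft32.s.
 * _mm_load_ps/_mm_store_ps (movaps) require 16-byte alignment, hence the
 * aligned buffers: 32 complex points = 64 floats each. */
#include <stdio.h>

static float fft_in[64]  __attribute__((aligned(16)));
static float fft_out[64] __attribute__((aligned(16)));

int main(void)
{
    int i;
    for (i = 0; i < 64; i++)
        fft_in[i] = (float)i;                   /* arbitrary test data */
    fft32(fft_in, fft_out);
    printf("%f %f\n", fft_out[0], fft_out[1]);  /* keep the call live */
    return 0;
}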
Jakob Stoklund Olesen
2012-Jul-06 06:39 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
On Jul 5, 2012, at 9:06 PM, Anthony Blake <amb33 at cs.waikato.ac.nz> wrote:

> I've noticed that LLVM tends to generate suboptimal code and spill an
> excessive amount of registers in large functions, such as in those
> that are automatically generated by FFTW.

One problem might be that we're forcing the 16 stores to the out array to happen in source order, which constrains the schedule. The stores are clearly non-aliasing.

> LLVM generates good code for a function that computes an 8-point
> complex FFT, but from 16-point upwards, icc or gcc generates much
> better code. Here is an example of a sequence of instructions from a
> 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:
>
> [...]
> movaps 32(%rdi), %xmm3
> movaps 48(%rdi), %xmm2
> movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
> movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
> addps %xmm0, %xmm1
> movaps %xmm1, -16(%rbp) ## 16-byte Spill
> movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
> [...]
>
> xmm3 loaded, duplicated into 2 registers, and then discarded as other
> data is loaded into it. Can anyone shed some light on why this might
> be happening?

I'm not actually seeing this behavior on trunk.

/jakob
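
To make the scheduling point concrete, the store side of fft32 boils down to the S_4 helper from the original post, restated below as a standalone sketch (the name store4 is mine, not from the thread):

#include <xmmintrin.h>

/* In fft32 the pointers passed to the first S_4 call are out+0, out+16,
 * out+32, out+48: four provably disjoint 16-byte slots.  Reordering these
 * stores, or interleaving them with the remaining butterflies, would
 * therefore be legal; if the backend keeps all 16 stores chained in source
 * order instead, the values feeding them must stay live until their store
 * issues, which adds register pressure. */
void store4(float *out, __m128 r0, __m128 r1, __m128 r2, __m128 r3)
{
    _mm_store_ps(out + 0,  r0);
    _mm_store_ps(out + 16, r1);
    _mm_store_ps(out + 32, r2);
    _mm_store_ps(out + 48, r3);
}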
Anthony Blake
2012-Jul-06 12:25 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
On Fri, Jul 6, 2012 at 6:39 PM, Jakob Stoklund Olesen <stoklund at 2pi.dk> wrote:
>
> On Jul 5, 2012, at 9:06 PM, Anthony Blake <amb33 at cs.waikato.ac.nz> wrote:
>
>> I've noticed that LLVM tends to generate suboptimal code and spill an
>> excessive amount of registers in large functions, such as in those
>> that are automatically generated by FFTW.
>
> One problem might be that we're forcing the 16 stores to the out array to happen in source order, which constrains the schedule. The stores are clearly non-aliasing.
>
>> LLVM generates good code for a function that computes an 8-point
>> complex FFT, but from 16-point upwards, icc or gcc generates much
>> better code. Here is an example of a sequence of instructions from a
>> 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:
>>
>> [...]
>> movaps 32(%rdi), %xmm3
>> movaps 48(%rdi), %xmm2
>> movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
>> movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
>> addps %xmm0, %xmm1
>> movaps %xmm1, -16(%rbp) ## 16-byte Spill
>> movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
>> [...]
>>
>> xmm3 loaded, duplicated into 2 registers, and then discarded as other
>> data is loaded into it. Can anyone shed some light on why this might
>> be happening?
>
> I'm not actually seeing this behavior on trunk.
>

I've just tried trunk, and although the exact behavior above isn't immediately obvious there, trunk generates more instructions and spills more registers than 3.1 does.

amb