Anthony Blake
2012-Jul-06 04:06 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
Hi,

I've noticed that LLVM tends to generate suboptimal code and spill an excessive amount of registers in large functions, such as in those that are automatically generated by FFTW.

LLVM generates good code for a function that computes an 8-point complex FFT, but from 16-point upwards, icc or gcc generates much better code. Here is an example of a sequence of instructions from a 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:

[...]
movaps 32(%rdi), %xmm3
movaps 48(%rdi), %xmm2
movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
addps %xmm0, %xmm1
movaps %xmm1, -16(%rbp) ## 16-byte Spill
movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
[...]

Here xmm3 is loaded, duplicated into two registers, and then discarded as other data is loaded into it. Can anyone shed some light on why this might be happening?

Below is the code for a 32-point FFT that generated the above instructions. Sorry I can't supply a smaller example, but this only begins to happen with functions of this size.

Regards,
Anthony

////////////////////// CODE /////////////////////////////////////////

#include <xmmintrin.h>

#define __INLINE static inline __attribute__((always_inline))

#define LOAD _mm_load_ps
#define STORE _mm_store_ps
#define ADD _mm_add_ps
#define SUB _mm_sub_ps
#define MULT _mm_mul_ps
#define STREAM _mm_stream_ps
#define SHUF _mm_shuffle_ps
#define VLIT4(a,b,c,d) _mm_set_ps(a,b,c,d)
#define SWAP(d) SHUF(d,d,_MM_SHUFFLE(2,3,0,1))
#define UNPACK2LO(a,b) SHUF(a,b,_MM_SHUFFLE(1,0,1,0))
#define UNPACK2HI(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,3,2))
#define HALFBLEND(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,1,0))

__INLINE void TX2(__m128 *a, __m128 *b) {
  __m128 TX2_t0 = UNPACK2LO(*a, *b);
  __m128 TX2_t1 = UNPACK2HI(*a, *b);
  *a = TX2_t0; *b = TX2_t1;
}

__INLINE void FMA(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = ADD(*Rd, MULT(Rn, Rm)); }
__INLINE void FMS(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = SUB(*Rd, MULT(Rn, Rm)); }

__INLINE __m128 SIGNIFY(__m128 a) {
  __m128 t1;
  t1 = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
  return t1;
}

__INLINE __m128 MULI(__m128 a) { return SWAP(SIGNIFY(a)); }

__INLINE __m128 MUL(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMS(&re, im, SWAP(d));
  return re;
}

__INLINE __m128 MULJ(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMA(&re, im, SWAP(d));
  return re;
}

__INLINE void S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3,
                  float *o0, float *o1, float *o2, float *o3) {
  STORE(o0, r0); STORE(o1, r1); STORE(o2, r2); STORE(o3, r3);
}

__INLINE void K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk, zk_d;
  uk = *r0; uk2 = *r1;
  zk   = ADD(*r2, *r3);
  zk_d = MULI(SUB(*r2, *r3));
  *r0 = ADD(uk, zk);
  *r2 = SUB(uk, zk);
  *r1 = SUB(uk2, zk_d);
  *r3 = ADD(uk2, zk_d);
}

__INLINE void K_N(__m128 re, __m128 im,
                  __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk_p, zk_n, zk, zk_d;
  uk = *r0; uk2 = *r1;
  zk_p = MUL(*r2, re, im);
  zk_n = MULJ(*r3, re, im);
  zk   = ADD(zk_p, zk_n);
  zk_d = MULI(SUB(zk_p, zk_n));
  *r2 = SUB(uk, zk);
  *r0 = ADD(uk, zk);
  *r3 = ADD(uk2, zk_d);
  *r1 = SUB(uk2, zk_d);
}

__INLINE void L_4_4(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t2 = LOAD(i2); t3 = LOAD(i3);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = MULI(SUB(t2, t3));
  t0 = ADD(t4, t6); t2 = SUB(t4, t6);
  t1 = SUB(t5, t7); t3 = ADD(t5, t7);
  TX2(&t0, &t1); TX2(&t2, &t3);
  *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}

__INLINE void L_2_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t2 = LOAD(i2); t3 = LOAD(i3);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = SUB(t2, t3);
  TX2(&t4, &t5); TX2(&t6, &t7);
  *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
}

__INLINE void L_4_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0); t1 = LOAD(i1); t6 = LOAD(i2); t7 = LOAD(i3);
  t2 = HALFBLEND(t6, t7); t3 = HALFBLEND(t7, t6);
  t4 = ADD(t0, t1); t5 = SUB(t0, t1);
  t6 = ADD(t2, t3); t7 = SUB(t2, t3);
  *r2 = UNPACK2HI(t4, t5); *r3 = UNPACK2HI(t6, t7);
  t7 = MULI(t7);
  t0 = ADD(t4, t6); t2 = SUB(t4, t6);
  t1 = SUB(t5, t7); t3 = ADD(t5, t7);
  *r0 = UNPACK2LO(t0, t1); *r1 = UNPACK2LO(t2, t3);
}

void fft32(const float *in, float *out) {
  __m128 r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15,
         r16_17, r18_19, r20_21, r22_23, r24_25, r26_27, r28_29, r30_31;

  L_4_4(in+0,  in+32, in+16, in+48, &r0_1,   &r2_3,   &r16_17, &r18_19);
  L_2_2(in+8,  in+40, in+56, in+24, &r4_5,   &r6_7,   &r20_21, &r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r0_1, &r2_3, &r4_5, &r6_7);
  L_4_2(in+4,  in+36, in+20, in+52, &r8_9,   &r10_11, &r28_29, &r30_31);
  L_4_4(in+60, in+28, in+12, in+44, &r12_13, &r14_15, &r24_25, &r26_27);
  K_N(VLIT4(0.9238,0.9238,1,1), VLIT4(0.3826,-0.3826,0,-0),
      &r0_1, &r4_5, &r8_9, &r12_13);
  K_N(VLIT4(0.3826,0.3826,0.7071,0.7071), VLIT4(0.9238,-0.9238,0.7071,-0.7071),
      &r2_3, &r6_7, &r10_11, &r14_15);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r16_17, &r18_19, &r20_21, &r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1), VLIT4(0.7071,-0.7071,0,-0),
      &r24_25, &r26_27, &r28_29, &r30_31);
  K_N(VLIT4(0.9807,0.9807,1,1), VLIT4(0.1950,-0.1950,0,-0),
      &r0_1, &r8_9, &r16_17, &r24_25);
  S_4(r0_1, r8_9, r16_17, r24_25, out+0, out+16, out+32, out+48);
  K_N(VLIT4(0.8314,0.8314,0.9238,0.9238), VLIT4(0.5555,-0.5555,0.3826,-0.3826),
      &r2_3, &r10_11, &r18_19, &r26_27);
  S_4(r2_3, r10_11, r18_19, r26_27, out+4, out+20, out+36, out+52);
  K_N(VLIT4(0.5555,0.5555,0.7071,0.7071), VLIT4(0.8314,-0.8314,0.7071,-0.7071),
      &r4_5, &r12_13, &r20_21, &r28_29);
  S_4(r4_5, r12_13, r20_21, r28_29, out+8, out+24, out+40, out+56);
  K_N(VLIT4(0.1950,0.1950,0.3826,0.3826), VLIT4(0.9807,-0.9807,0.9238,-0.9238),
      &r6_7, &r14_15, &r22_23, &r30_31);
  S_4(r6_7, r14_15, r22_23, r30_31, out+12, out+28, out+44, out+60);
}
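
For reference, a minimal driver along the following lines (appended to the translation unit above) can be used to regenerate the assembly; the buffer initialisation, file name, and compile flags in the comment are my assumptions, not part of the original post:

/* Hypothetical harness for inspecting the generated code, e.g. with
 * something like "clang -O3 -S fft32.c" and then reading fft32.s.
 * _mm_load_ps/_mm_store_ps (movaps) require 16-byte alignment, hence the
 * aligned buffers: 32 complex points = 64 floats each. */
#include <stdio.h>

static float fft_in[64]  __attribute__((aligned(16)));
static float fft_out[64] __attribute__((aligned(16)));

int main(void)
{
    int i;
    for (i = 0; i < 64; i++)
        fft_in[i] = (float)i;                   /* arbitrary test data */
    fft32(fft_in, fft_out);
    printf("%f %f\n", fft_out[0], fft_out[1]);  /* keep the call live */
    return 0;
}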
Jakob Stoklund Olesen
2012-Jul-06 06:39 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
On Jul 5, 2012, at 9:06 PM, Anthony Blake <amb33 at cs.waikato.ac.nz> wrote:

> I've noticed that LLVM tends to generate suboptimal code and spill an
> excessive amount of registers in large functions, such as in those
> that are automatically generated by FFTW.

One problem might be that we're forcing the 16 stores to the out array to happen in source order, which constrains the schedule. The stores are clearly non-aliasing.

> LLVM generates good code for a function that computes an 8-point
> complex FFT, but from 16-point upwards, icc or gcc generates much
> better code. Here is an example of a sequence of instructions from a
> 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:
>
> [...]
> movaps 32(%rdi), %xmm3
> movaps 48(%rdi), %xmm2
> movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
> movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
> addps %xmm0, %xmm1
> movaps %xmm1, -16(%rbp) ## 16-byte Spill
> movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
> [...]
>
> xmm3 loaded, duplicated into 2 registers, and then discarded as other
> data is loaded into it. Can anyone shed some light on why this might
> be happening?

I'm not actually seeing this behavior on trunk.

/jakob
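
To make the scheduling point concrete, the store side of fft32 boils down to the S_4 helper from the original post, restated below as a standalone sketch (the name store4 is mine, not from the thread):

#include <xmmintrin.h>

/* In fft32 the pointers passed to the first S_4 call are out+0, out+16,
 * out+32, out+48: four provably disjoint 16-byte slots.  Reordering these
 * stores, or interleaving them with the remaining butterflies, would
 * therefore be legal; if the backend keeps all 16 stores chained in source
 * order instead, the values feeding them must stay live until their store
 * issues, which adds register pressure. */
void store4(float *out, __m128 r0, __m128 r1, __m128 r2, __m128 r3)
{
    _mm_store_ps(out + 0,  r0);
    _mm_store_ps(out + 16, r1);
    _mm_store_ps(out + 32, r2);
    _mm_store_ps(out + 48, r3);
}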
Anthony Blake
2012-Jul-06 12:25 UTC
[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW
On Fri, Jul 6, 2012 at 6:39 PM, Jakob Stoklund Olesen <stoklund at 2pi.dk> wrote:
>
> On Jul 5, 2012, at 9:06 PM, Anthony Blake <amb33 at cs.waikato.ac.nz> wrote:
>
>> I've noticed that LLVM tends to generate suboptimal code and spill an
>> excessive amount of registers in large functions, such as in those
>> that are automatically generated by FFTW.
>
> One problem might be that we're forcing the 16 stores to the out array to happen in source order, which constrains the schedule. The stores are clearly non-aliasing.
>
>> LLVM generates good code for a function that computes an 8-point
>> complex FFT, but from 16-point upwards, icc or gcc generates much
>> better code. Here is an example of a sequence of instructions from a
>> 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:
>>
>> [...]
>> movaps 32(%rdi), %xmm3
>> movaps 48(%rdi), %xmm2
>> movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
>> movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
>> addps %xmm0, %xmm1
>> movaps %xmm1, -16(%rbp) ## 16-byte Spill
>> movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
>> [...]
>>
>> xmm3 loaded, duplicated into 2 registers, and then discarded as other
>> data is loaded into it. Can anyone shed some light on why this might
>> be happening?
>
> I'm not actually seeing this behavior on trunk.
>

I've just tried trunk, and although the exact behavior above isn't immediately obvious there, trunk generates more instructions and spills more registers than 3.1 does.

amb