martin krastev
2014-Oct-13 16:13 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hello,

Depending on how I extract integer lanes from an x86_64 xmm register, the backend may spill that register in order to load scalars. The effect was observed on two targets: corei7-avx and btver1 (I haven't checked other targets).

Here's a test case with the spilling/no-spilling code variants selected by conditional compilation:

#if __SSE4_1__ != 0
#include <smmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>

#if SPILLING_ENSUES == 1
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return _mm_cvtsi128_si32(v);
    case 1:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
    case 2:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
    case 3:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
    }

    assert(0);
    return -1;
}

#else
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return int32_t(v[0] >> 0);
    case 1:
        return int32_t(v[0] >> 32);
    case 2:
        return int32_t(v[1] >> 0);
    case 3:
        return int32_t(v[1] >> 32);
    }

    assert(0);
    return -1;
}
#endif

__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
__m128 r[1];

static const float table[3] = {
    1.0,
    2.0,
    4.0,
};

static __m128 testee(
    const __m128 x)
{
    const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23), _mm_set1_epi32(127));
    const __m128 s = _mm_or_ps(
        _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
        _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));

    const __m128 exp = _mm_cvtepi32_ps(iexp);
    const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
    const __m128i rem = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot, _mm_set1_epi32(0x10003)));

    const __m128 entry = _mm_setr_ps( // 'rem' gets spilled depending on version of lane extractor used
        table[geti(rem, 0)],
        table[geti(rem, 1)],
        table[geti(rem, 2)],
        table[geti(rem, 3)]);

    return _mm_set1_ps(.5f) * entry;
}

int main(int argc, char** argv)
{
    r[0] = testee(x[0]);
    return 0;
}

In the above function 'testee' (duly inlined in the disassembly below), local var 'rem' gets spilled and read back as scalars, depending on which version of the integer lane accessor was used.

Output from clang 3.4 for target corei7-avx:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0: vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8: vpsrld $0x17,%xmm0,%xmm0
  4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505: vcvtdq2ps %xmm0,%xmm1
  400509: vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511: vcvttps2dq %xmm1,%xmm1
  400515: vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d: vpsubd %xmm1,%xmm0,%xmm0
  400521: vmovq %xmm0,%rax
  400526: movslq %eax,%rcx
  400529: sar $0x20,%rax
  40052d: vpextrq $0x1,%xmm0,%rdx
  400533: movslq %edx,%rsi
  400536: sar $0x20,%rdx
  40053a: vmovss 0x4006c0(,%rcx,4),%xmm0
  400543: vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
  40054e: vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400559: vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
  400564: vmulps 0x144(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056c: vmovaps %xmm0,0x20046c(%rip)        # 6009e0 <r>
  400574: xor %eax,%eax
  400576: retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0: vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8: vpsrld $0x17,%xmm0,%xmm0
  4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505: vcvtdq2ps %xmm0,%xmm1
  400509: vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511: vcvttps2dq %xmm1,%xmm1
  400515: vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d: vpsubd %xmm1,%xmm0,%xmm0
  400521: vmovdqa %xmm0,-0x18(%rsp)
  400527: movslq -0x18(%rsp),%rax
  40052c: movslq -0x14(%rsp),%rcx
  400531: movslq -0x10(%rsp),%rdx
  400536: movslq -0xc(%rsp),%rsi
  40053b: vmovss 0x4006c0(,%rax,4),%xmm0
  400544: vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
  40054f: vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
  40055a: vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400565: vmulps 0x143(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056d: vmovaps %xmm0,0x20046b(%rip)        # 6009e0 <r>
  400575: xor %eax,%eax
  400577: retq

Output from clang pre-release 3.5 trunk for target btver1:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld $0x17,%xmm0
  4005cd: paddd 0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps 0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd %xmm1,%xmm0
  4005ef: movq %xmm0,%rax
  4005f4: movslq %eax,%rcx
  4005f7: sar $0x20,%rax
  4005fb: punpckhqdq %xmm0,%xmm0
  4005ff: movq %xmm0,%rdx
  400604: movslq %edx,%rsi
  400607: sar $0x20,%rdx
  40060b: movss 0x400740(,%rax,4),%xmm0
  400614: movss 0x400740(,%rdx,4),%xmm1
  40061d: unpcklps %xmm1,%xmm0
  400620: movss 0x400740(,%rcx,4),%xmm1
  400629: movss 0x400740(,%rsi,4),%xmm2
  400632: unpcklps %xmm2,%xmm1
  400635: unpcklps %xmm0,%xmm1
  400638: mulps 0xf1(%rip),%xmm1        # 400730 <.LCPI0_3>
  40063f: movaps %xmm1,0x1a1a(%rip)        # 402060 <r>
  400646: xor %eax,%eax
  400648: retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld $0x17,%xmm0
  4005cd: paddd 0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps 0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd %xmm1,%xmm0
  4005ef: movdqa %xmm0,-0x18(%rsp)
  4005f5: movslq -0x18(%rsp),%rax
  4005fa: movslq -0x14(%rsp),%rcx
  4005ff: movslq -0x10(%rsp),%rdx
  400604: movslq -0xc(%rsp),%rsi
  400609: movss 0x400740(,%rsi,4),%xmm0
  400612: movss 0x400740(,%rcx,4),%xmm1
  40061b: unpcklps %xmm0,%xmm1
  40061e: movss 0x400740(,%rdx,4),%xmm0
  400627: movss 0x400740(,%rax,4),%xmm2
  400630: unpcklps %xmm0,%xmm2
  400633: unpcklps %xmm1,%xmm2
  400636: mulps 0xf3(%rip),%xmm2        # 400730 <.LCPI0_3>
  40063d: movaps %xmm2,0x1a1c(%rip)        # 402060 <r>
  400644: xor %eax,%eax
  400646: retq

Is that behavior expected? I find it odd.

Best regards,
Martin
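As an aside (not part of the original report), SSE4.1 targets also offer a direct lane extractor via _mm_extract_epi32 (pextrd); whether it sidesteps the spill discussed in this thread has not been verified here. A minimal sketch, assuming an SSE4.1-capable target:

#if __SSE4_1__ != 0
#include <smmintrin.h>  // _mm_extract_epi32 (SSE4.1)
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

// Hypothetical third variant of the lane accessor, for comparison only.
// _mm_extract_epi32 requires a compile-time-constant lane index, hence
// the switch over literal indices.
static int32_t geti_pextrd(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return _mm_extract_epi32(v, 0);
    case 1:
        return _mm_extract_epi32(v, 1);
    case 2:
        return _mm_extract_epi32(v, 2);
    case 3:
        return _mm_extract_epi32(v, 3);
    }

    assert(0);
    return -1;
}
#endif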
Quentin Colombet
2014-Oct-13 17:03 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hi Martin,

I haven’t checked what is going on here, but if you believe some spill can be avoided, this is worth filing a PR (www.llvm.org/bugs) under Libraries -> Register Allocator. Please attach the IR needed to reproduce the problem (-emit-llvm from clang).

Thanks,
-Quentin

On Oct 13, 2014, at 9:13 AM, martin krastev <blu.dark at gmail.com> wrote:

> Hello,
>
> Depending on how I extract integer lanes from an x86_64 xmm register, the backend may spill that register in order to load scalars. The effect was observed on two targets: corei7-avx and btver1 (I haven't checked other targets).
>
> [...]
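For reference, the IR Quentin asks for can be produced with clang's -S -emit-llvm flags. A command mirroring the original compile line (a sketch; the output file name is illustrative) would be:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 -S -emit-llvm -o test.ll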
martin krastev
2014-Oct-14 08:36 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hi Quentin,

Thank you for the directions. Here is the bug ticket: http://llvm.org/bugs/show_bug.cgi?id=21269

Best regards,
Martin

On Mon, Oct 13, 2014 at 8:03 PM, Quentin Colombet <qcolombet at apple.com> wrote:

> Hi Martin,
>
> I haven’t checked what is going on here, but if you believe some spill can
> be avoided, this is worth filing a PR (www.llvm.org/bugs) to libraries ->
> Register Allocator.
> Please attach the IR to reproduce the problem (-emit-llvm from clang).
>
> Thanks,
> -Quentin
>
> [...]