martin krastev
2014-Oct-13 16:13 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hello,

Depending on how I extract integer lanes from an x86_64 xmm register, the backend may spill that register in order to load scalars. The effect was observed on two targets: corei7-avx and btver1 (I haven't checked other targets).

Here's a test case with the spilling/no-spilling code variants selected by conditional compilation:

#if __SSE4_1__ != 0
#include <smmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>

#if SPILLING_ENSUES == 1
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return _mm_cvtsi128_si32(v);
    case 1:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
    case 2:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
    case 3:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
    }

    assert(0);
    return -1;
}

#else
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return int32_t(v[0] >> 0);
    case 1:
        return int32_t(v[0] >> 32);
    case 2:
        return int32_t(v[1] >> 0);
    case 3:
        return int32_t(v[1] >> 32);
    }

    assert(0);
    return -1;
}
#endif

__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
__m128 r[1];

static const float table[3] = {
    1.0,
    2.0,
    4.0,
};

static __m128 testee(
    const __m128 x)
{
    const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23), _mm_set1_epi32(127));
    const __m128 s = _mm_or_ps(
        _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
        _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));

    const __m128 exp = _mm_cvtepi32_ps(iexp);
    const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
    const __m128i rem = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot, _mm_set1_epi32(0x10003)));

    const __m128 entry = _mm_setr_ps( // 'rem' gets spilled depending on version of lane extractor used
        table[geti(rem, 0)],
        table[geti(rem, 1)],
        table[geti(rem, 2)],
        table[geti(rem, 3)]);

    return _mm_set1_ps(.5f) * entry;
}

int main(int argc, char** argv)
{
    r[0] = testee(x[0]);
    return 0;
}

In the above function 'testee' (duly inlined in the disassembly below), local var 'rem' gets spilled and read back as scalars, depending on which version of the integer lane accessor was used.

Output from clang 3.4 for target corei7-avx:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0: vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8: vpsrld $0x17,%xmm0,%xmm0
  4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505: vcvtdq2ps %xmm0,%xmm1
  400509: vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511: vcvttps2dq %xmm1,%xmm1
  400515: vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d: vpsubd %xmm1,%xmm0,%xmm0
  400521: vmovq %xmm0,%rax
  400526: movslq %eax,%rcx
  400529: sar $0x20,%rax
  40052d: vpextrq $0x1,%xmm0,%rdx
  400533: movslq %edx,%rsi
  400536: sar $0x20,%rdx
  40053a: vmovss 0x4006c0(,%rcx,4),%xmm0
  400543: vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
  40054e: vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400559: vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
  400564: vmulps 0x144(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056c: vmovaps %xmm0,0x20046c(%rip)        # 6009e0 <r>
  400574: xor %eax,%eax
  400576: retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0: vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8: vpsrld $0x17,%xmm0,%xmm0
  4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505: vcvtdq2ps %xmm0,%xmm1
  400509: vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511: vcvttps2dq %xmm1,%xmm1
  400515: vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d: vpsubd %xmm1,%xmm0,%xmm0
  400521: vmovdqa %xmm0,-0x18(%rsp)
  400527: movslq -0x18(%rsp),%rax
  40052c: movslq -0x14(%rsp),%rcx
  400531: movslq -0x10(%rsp),%rdx
  400536: movslq -0xc(%rsp),%rsi
  40053b: vmovss 0x4006c0(,%rax,4),%xmm0
  400544: vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
  40054f: vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
  40055a: vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400565: vmulps 0x143(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056d: vmovaps %xmm0,0x20046b(%rip)        # 6009e0 <r>
  400575: xor %eax,%eax
  400577: retq

Output from clang pre-release 3.5 trunk for target btver1:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld $0x17,%xmm0
  4005cd: paddd 0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps 0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd %xmm1,%xmm0
  4005ef: movq %xmm0,%rax
  4005f4: movslq %eax,%rcx
  4005f7: sar $0x20,%rax
  4005fb: punpckhqdq %xmm0,%xmm0
  4005ff: movq %xmm0,%rdx
  400604: movslq %edx,%rsi
  400607: sar $0x20,%rdx
  40060b: movss 0x400740(,%rax,4),%xmm0
  400614: movss 0x400740(,%rdx,4),%xmm1
  40061d: unpcklps %xmm1,%xmm0
  400620: movss 0x400740(,%rcx,4),%xmm1
  400629: movss 0x400740(,%rsi,4),%xmm2
  400632: unpcklps %xmm2,%xmm1
  400635: unpcklps %xmm0,%xmm1
  400638: mulps 0xf1(%rip),%xmm1        # 400730 <.LCPI0_3>
  40063f: movaps %xmm1,0x1a1a(%rip)        # 402060 <r>
  400646: xor %eax,%eax
  400648: retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld $0x17,%xmm0
  4005cd: paddd 0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps 0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd %xmm1,%xmm0
  4005ef: movdqa %xmm0,-0x18(%rsp)
  4005f5: movslq -0x18(%rsp),%rax
  4005fa: movslq -0x14(%rsp),%rcx
  4005ff: movslq -0x10(%rsp),%rdx
  400604: movslq -0xc(%rsp),%rsi
  400609: movss 0x400740(,%rsi,4),%xmm0
  400612: movss 0x400740(,%rcx,4),%xmm1
  40061b: unpcklps %xmm0,%xmm1
  40061e: movss 0x400740(,%rdx,4),%xmm0
  400627: movss 0x400740(,%rax,4),%xmm2
  400630: unpcklps %xmm0,%xmm2
  400633: unpcklps %xmm1,%xmm2
  400636: mulps 0xf3(%rip),%xmm2        # 400730 <.LCPI0_3>
  40063d: movaps %xmm2,0x1a1c(%rip)        # 402060 <r>
  400644: xor %eax,%eax
  400646: retq

Is that behavior expected? I find it odd.

Best regards,
Martin
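As an aside (not part of the original report), SSE4.1 targets also offer a direct lane extractor via _mm_extract_epi32 (pextrd); whether it sidesteps the spill discussed in this thread has not been verified here. A minimal sketch, assuming an SSE4.1-capable target:

#if __SSE4_1__ != 0
#include <smmintrin.h>  // _mm_extract_epi32 (SSE4.1)
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

// Hypothetical third variant of the lane accessor, for comparison only.
// _mm_extract_epi32 requires a compile-time-constant lane index, hence
// the switch over literal indices.
static int32_t geti_pextrd(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return _mm_extract_epi32(v, 0);
    case 1:
        return _mm_extract_epi32(v, 1);
    case 2:
        return _mm_extract_epi32(v, 2);
    case 3:
        return _mm_extract_epi32(v, 3);
    }

    assert(0);
    return -1;
}
#endif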
Quentin Colombet
2014-Oct-13 17:03 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hi Martin,

I haven’t checked what is going on here, but if you believe some spill can be avoided, this is worth filing a PR (www.llvm.org/bugs) under Libraries -> Register Allocator. Please attach the IR needed to reproduce the problem (-emit-llvm from clang).

Thanks,
-Quentin

On Oct 13, 2014, at 9:13 AM, martin krastev <blu.dark at gmail.com> wrote:

> Hello,
>
> Depending on how I extract integer lanes from an x86_64 xmm register, the backend may spill that register in order to load scalars. The effect was observed on two targets: corei7-avx and btver1 (I haven't checked other targets).
>
> [...]
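For reference, the IR Quentin asks for can be produced with clang's -S -emit-llvm flags. A command mirroring the original compile line (a sketch; the output file name is illustrative) would be:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1 -S -emit-llvm -o test.ll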
martin krastev
2014-Oct-14 08:36 UTC
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
Hi Quentin,

Thank you for the directions. Here is the bug ticket: http://llvm.org/bugs/show_bug.cgi?id=21269

Best regards,
Martin

On Mon, Oct 13, 2014 at 8:03 PM, Quentin Colombet <qcolombet at apple.com> wrote:

> Hi Martin,
>
> I haven’t checked what is going on here, but if you believe some spill can
> be avoided, this is worth filing a PR (www.llvm.org/bugs) to libraries ->
> Register Allocator.
> Please attach the IR to reproduce the problem (-emit-llvm from clang).
>
> Thanks,
> -Quentin
>
> [...]