Sebastien DELDON-GNB
2013-Feb-19 09:52 UTC
[LLVMdev] Is it a bug or am I missing something ?
Hi all, on the following code: ; ModuleID = 'shufxbug.ll' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-pc-linux-gnu" define void @sample_test(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline { L.entry: %0 = getelementptr <4 x float>* %source, i32 19 %1 = load <4 x float>* %0, align 16 %2 = extractelement <4 x float> %1, i32 0 %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %2, i32 2 %4 = insertelement <8 x float> %3, float %2, i32 1 %5 = getelementptr <8 x float>* %dest, i32 19 store <8 x float> %4, <8 x float>* %5, align 4 ret void } I'm expecting LLVM to generate code so that a vector is stored at dest[19] with the following value: <float 0.000000e+00, float 'elem_0_of_source' , float 'elem_0_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00> When I use llc trunk as follows on a Core i7 machine I get the following assembly code: llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s .file "shufxbug.ll" .text .globl sample_test .align 16, 0x90 .type sample_test,@function sample_test: # @sample_test # BB#0: # %L.entry movl 4(%esp), %eax movss 304(%eax), %xmm0 xorps %xmm1, %xmm1 movl 8(%esp), %eax movups %xmm1, 624(%eax) pshufd $65, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,1] movdqu %xmm0, 608(%eax) ret .Ltmp0: .size sample_test, .Ltmp0-sample_test .section ".note.GNU-stack","",@progbits It seems to me that this sequence of instructions is building the vector: <float 'elem_1_of_source', float 'elem_0_of_source' , float 'elem_0_of_source', float 'elem_1_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00> On a Sandy Bridge system, I get similar behavior with slightly different code 
(using AVX): pushl %ebp .Ltmp5: .cfi_def_cfa_offset 8 .Ltmp6: .cfi_offset %ebp, -8 movl %esp, %ebp .Ltmp7: .cfi_def_cfa_register %ebp movl 12(%ebp), %eax .loc 1 9 0 prologue_end # shufxbug.cl:9:0 .Ltmp8: vpermilps $65, 304(%eax), %xmm0 # xmm0 = mem[1,0,0,1] vxorps %xmm1, %xmm1, %xmm1 vinsertf128 $1, %xmm1, %ymm0, %ymm0 movl 16(%ebp), %eax .loc 1 10 0 # shufxbug.cl:10:0 vmovups %ymm0, 608(%eax) .loc 1 11 0 # shufxbug.cl:11:0 popl %ebp vzeroupper ret It seems to me that the generated code is not correct in both cases. Can someone confirm, or indicate what I did wrong if it is not a bug? If this ends up being an actual bug, I'll submit it to the bug tracking system. Thanks, Seb
<<<<<<<<<<<<<<<<<<<<<<<<<< ; ModuleID = 'shufxbug.ll' target datalayout "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:6 4-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-pc-linux-gnu" define void @sample_test(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline { L.entry: %0 = getelementptr <4 x float>* %source, i32 19 %1 = load <4 x float>* %0, align 16 %2 = extractelement <4 x float> %1, i32 0 %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %2, i32 2 %4 = insertelement <8 x float> %3, float %2, i32 1 %5 = getelementptr <8 x float>* %dest, i32 19 store <8 x float> %4, <8 x float>* %5, align 4 ret void } I'm expecting LLVM to generate code so that at vecor is stored at dest[19] with following value: <float 0.000000e+00, float 'elem_0_of_source' , float 'elem_0_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00> When I use llc trunk as follows on a Corei7 machine I've got following assembly code: llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s .file "shufxbug.ll" .text .globl sample_test .align 16, 0x90 .type sample_test, at function sample_test: # @sample_test # BB#0: # %L.entry movl 4(%esp), %eax movss 304(%eax), %xmm0 xorps %xmm1, %xmm1 movl 8(%esp), %eax movups %xmm1, 624(%eax) pshufd $65, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,1] movdqu %xmm0, 608(%eax) ret .Ltmp0: .size sample_test, .Ltmp0-sample_test .section ".note.GNU-stack","", at progbits It seems to me that this sequence of instruction is building vector: <float 'elem_1_of_source', float 'elem_0_of_source' , float 'elem_0_of_source', float 'elem_1_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00> <<<<<<<<<<<<<<<<<<<<<<<< Umm, isn't the movss (not movps) instruction just loading the 
bottom lane of the XMM register, so that IF (presumably due to calling convention) we know xmm0 is already zeroed this is constructing what's desired? If you print the output, is it actually not what the IR leads you to expect? Cheers, Dave
Sebastien DELDON-GNB
2013-Feb-19 10:25 UTC
[LLVMdev] Is it a bug or am I missing something ?
Hi David, Thanks for the quick reply. Indeed, my problem is that on Core i7 it works whereas on Sandy Bridge it fails. Given your explanation of the movss instruction, you're correct: MOVSS will initialize XMM0 with elem 0 of source and clear bits 32 to 127, which explains why the code is correct on Core i7. Now looking at the Sandy Bridge code: XMM0 is initialized using the vpermilps instruction as follows: vpermilps $65, 304(%eax), %xmm0 # xmm0 = mem[1,0,0,1] xmm0 is initialized with <elem 1 of source, elem 0 of source, elem 0 of source, elem 1 of source>, which is not what's expected. I guess this is a bug in Sandy Bridge code generation. Can someone confirm? Best Regards, Seb > -----Original Message----- > From: David Tweed [mailto:david.tweed at arm.com] > Sent: Tuesday, February 19, 2013 11:08 AM > To: Sebastien DELDON-GNB; LLVMdev at cs.uiuc.edu > Subject: RE: Is it a bug or am I missing something ? > > <<<<<<<<<<<<<<<<<<<<<<<<<< > ; ModuleID = 'shufxbug.ll' > target datalayout > "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:6 > 4-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" > target triple = "i386-pc-linux-gnu" > > define void @sample_test(<4 x float>* nocapture %source, <8 x float>* > nocapture %dest) nounwind noinline { > L.entry: > %0 = getelementptr <4 x float>* %source, i32 19 > %1 = load <4 x float>* %0, align 16 > %2 = extractelement <4 x float> %1, i32 0 > %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float > undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float > 0.000000e+00, float 0.000000e+00>, float %2, i32 2 > %4 = insertelement <8 x float> %3, float %2, i32 1 > %5 = getelementptr <8 x float>* %dest, i32 19 > store <8 x float> %4, <8 x float>* %5, align 4 > ret void > } > > > I'm expecting LLVM to generate code so that at vecor is stored at dest[19] > with following value: > > <float 0.000000e+00, float 'elem_0_of_source' , float 'elem_0_of_source', > float 0.000000e+00, float 
0.000000e+00, float 0.000000e+00, float > 0.000000e+00, float 0.000000e+00> > > When I use llc trunk as follows on a Corei7 machine I've got following > assembly code: > > llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s > > > .file "shufxbug.ll" > .text > .globl sample_test > .align 16, 0x90 > .type sample_test, at function > sample_test: # @sample_test > # BB#0: # %L.entry > movl 4(%esp), %eax > movss 304(%eax), %xmm0 > xorps %xmm1, %xmm1 > movl 8(%esp), %eax > movups %xmm1, 624(%eax) > pshufd $65, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,1] > movdqu %xmm0, 608(%eax) > ret > .Ltmp0: > .size sample_test, .Ltmp0-sample_test > > > .section ".note.GNU-stack","", at progbits > > > It seems to me that this sequence of instruction is building vector: > > > <float 'elem_1_of_source', float 'elem_0_of_source' , float > 'elem_0_of_source', float 'elem_1_of_source', float 0.000000e+00, float > 0.000000e+00, float 0.000000e+00, float 0.000000e+00> > > <<<<<<<<<<<<<<<<<<<<<<<< > > Umm, isn't the movss (not movps) instruction just loading the bottom lane of > the XMM register, so that IF (presumably due to calling convention) we know > xmm0 is already zeroed this is constructing what's desired? If you print the > output, is it actually not what the IR leads you to expect? > > Cheers, > Dave > > >
Apparently Analogous Threads
- [LLVMdev] Is it a bug or am I missing something ?
- [LLVMdev] "equivalent" .ll files diverge after optimizations are applied
- [LLVMdev] "equivalent" .ll files diverge after optimizations are applied
- [LLVMdev] "equivalent" .ll files diverge after optimizations are applied
- [LLVMdev] Please benchmark new x86 vector shuffle lowering, planning to make it the default very soon!