thr3ads.net - llvm dev - [LLVMdev] Is it a bug or am I missing something ? [Feb 2013]

If this information is useful, please help other people find it:
Share via:

Sebastien DELDON-GNB

2013-Feb-19 09:52 UTC

[LLVMdev] Is it a bug or am I missing something ?

Hi all,

on following code:


; ModuleID = 'shufxbug.ll'
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i386-pc-linux-gnu"

define void @sample_test(<4 x float>* nocapture %source, <8 x
float>* nocapture %dest) nounwind noinline {
L.entry:
  %0 = getelementptr <4 x float>* %source, i32 19
  %1 = load <4 x float>* %0, align 16
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <8 x float> <float 0.000000e+00, float undef,
float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float
0.000000e+00, float 0.000000e+00>, float %2, i32 2
  %4 = insertelement <8 x float> %3, float %2, i32 1
  %5 = getelementptr <8 x float>* %dest, i32 19
  store <8 x float> %4, <8 x float>* %5, align 4
  ret void
}


I'm expecting LLVM to generate code so that a vector is stored at dest[19]
with the following value:

<float 0.000000e+00, float 'elem_0_of_source' , float
'elem_0_of_source', float 0.000000e+00, float 0.000000e+00, float
0.000000e+00, float 0.000000e+00, float 0.000000e+00>

When I use llc trunk as follows on a Corei7 machine I've got following
assembly code:

llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s


    .file   "shufxbug.ll"
    .text
    .globl  sample_test
    .align  16, 0x90
    .type   sample_test,@function
sample_test:                            # @sample_test
# BB#0:                                 # %L.entry
    movl    4(%esp), %eax 
    movss   304(%eax), %xmm0
    xorps   %xmm1, %xmm1
    movl    8(%esp), %eax
    movups  %xmm1, 624(%eax)
    pshufd  $65, %xmm0, %xmm0       # xmm0 = xmm0[1,0,0,1]
    movdqu  %xmm0, 608(%eax)
    ret
.Ltmp0:
    .size   sample_test, .Ltmp0-sample_test
    
    
    .section    ".note.GNU-stack","",@progbits


It seems to me that this sequence of instructions is building the vector:


<float 'elem_1_of_source', float 'elem_0_of_source' , float
'elem_0_of_source', float 'elem_1_of_source', float
0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>

On a sandy bridge system, I've got similar behavior with a slightly
different code (using AVX):

    pushl   %ebp
.Ltmp5:
    .cfi_def_cfa_offset 8
.Ltmp6:
    .cfi_offset %ebp, -8
    movl    %esp, %ebp
.Ltmp7:
    .cfi_def_cfa_register %ebp
    movl    12(%ebp), %eax
    .loc    1 9 0 prologue_end      # shufxbug.cl:9:0
.Ltmp8:
    vpermilps   $65, 304(%eax), %xmm0 # xmm0 = mem[1,0,0,1]
    vxorps  %xmm1, %xmm1, %xmm1
    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    movl    16(%ebp), %eax
    .loc    1 10 0                  # shufxbug.cl:10:0
    vmovups %ymm0, 608(%eax)
    .loc    1 11 0                  # shufxbug.cl:11:0
    popl    %ebp
    vzeroupper
    ret

It seems to me that the generated code is not correct in both cases. Can someone
confirm, or indicate what I did wrong if it is not a bug?
If this ends up being an actual bug, I'll submit it to the bug tracking system.

Thanks
Seb

David Tweed

2013-Feb-19 10:08 UTC

head link

[LLVMdev] Is it a bug or am I missing something ?

<<<<<<<<<<<<<<<<<<<<<<<<<<
; ModuleID = 'shufxbug.ll'
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i386-pc-linux-gnu"

define void @sample_test(<4 x float>* nocapture %source, <8 x
float>*
nocapture %dest) nounwind noinline {
L.entry:
  %0 = getelementptr <4 x float>* %source, i32 19
  %1 = load <4 x float>* %0, align 16
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <8 x float> <float 0.000000e+00, float undef,
float
undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float
0.000000e+00, float 0.000000e+00>, float %2, i32 2
  %4 = insertelement <8 x float> %3, float %2, i32 1
  %5 = getelementptr <8 x float>* %dest, i32 19
  store <8 x float> %4, <8 x float>* %5, align 4
  ret void
}


I'm expecting LLVM to generate code so that at vecor is stored at dest[19]
with following value:

<float 0.000000e+00, float 'elem_0_of_source' , float
'elem_0_of_source',
float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float
0.000000e+00, float 0.000000e+00>

When I use llc trunk as follows on a Corei7 machine I've got following
assembly code:

llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s


    .file   "shufxbug.ll"
    .text
    .globl  sample_test
    .align  16, 0x90
    .type   sample_test, at function
sample_test:                            # @sample_test
# BB#0:                                 # %L.entry
    movl    4(%esp), %eax 
    movss   304(%eax), %xmm0
    xorps   %xmm1, %xmm1
    movl    8(%esp), %eax
    movups  %xmm1, 624(%eax)
    pshufd  $65, %xmm0, %xmm0       # xmm0 = xmm0[1,0,0,1]
    movdqu  %xmm0, 608(%eax)
    ret
.Ltmp0:
    .size   sample_test, .Ltmp0-sample_test
    
    
    .section    ".note.GNU-stack","", at progbits


It seems to me that this sequence of instruction is building vector:


<float 'elem_1_of_source', float 'elem_0_of_source' , float
'elem_0_of_source', float 'elem_1_of_source', float
0.000000e+00, float
0.000000e+00, float 0.000000e+00, float 0.000000e+00>

<<<<<<<<<<<<<<<<<<<<<<<<

Umm, isn't the movss (not movps) instruction just loading the bottom lane of
the XMM register, so that IF (presumably due to calling convention) we know
xmm0 is already zeroed this is constructing what's desired? If you print the
output, is it actually not what the IR leads you to expect?

Cheers,
Dave

Sebastien DELDON-GNB

2013-Feb-19 10:25 UTC

head link

[LLVMdev] Is it a bug or am I missing something ?

Hi David,

Thanks for the quick reply. Indeed my problem is that on Core i7 it works
whereas on Sandy Bridge it fails. Given your explanation of the movss instruction,
you're correct:

MOVSS will initialize XMM0 with elem 0 of source and clear bits 32 to 127, and
thus it explains why the code is correct on Core i7. Now looking at the Sandy Bridge
code: XMM0 is initialized using the vpermilps instruction as follows:

vpermilps   $65, 304(%eax), %xmm0 # xmm0 = mem[1,0,0,1]

xmm0 is initialized with <elem 1 of source, elem 0 of source, elem 0 of
source , elem 1 of source>, which is not what's expected.
I guess this is a bug in sandy bridge code generation.
Can someone confirm ?

Best Regards
Seb

> -----Original Message-----
> From: David Tweed [mailto:david.tweed at arm.com]
> Sent: Tuesday, February 19, 2013 11:08 AM
> To: Sebastien DELDON-GNB; LLVMdev at cs.uiuc.edu
> Subject: RE: Is it a bug or am I missing something ?
> 
>
<<<<<<<<<<<<<<<<<<<<<<<<<<
> ; ModuleID = 'shufxbug.ll'
> target datalayout >
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:6
> 4-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
> target triple = "i386-pc-linux-gnu"
> 
> define void @sample_test(<4 x float>* nocapture %source, <8 x
float>*
> nocapture %dest) nounwind noinline {
> L.entry:
>   %0 = getelementptr <4 x float>* %source, i32 19
>   %1 = load <4 x float>* %0, align 16
>   %2 = extractelement <4 x float> %1, i32 0
>   %3 = insertelement <8 x float> <float 0.000000e+00, float undef,
float
> undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float
> 0.000000e+00, float 0.000000e+00>, float %2, i32 2
>   %4 = insertelement <8 x float> %3, float %2, i32 1
>   %5 = getelementptr <8 x float>* %dest, i32 19
>   store <8 x float> %4, <8 x float>* %5, align 4
>   ret void
> }
> 
> 
> I'm expecting LLVM to generate code so that at vecor is stored at
dest[19]
> with following value:
> 
> <float 0.000000e+00, float 'elem_0_of_source' , float
'elem_0_of_source',
> float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float
> 0.000000e+00, float 0.000000e+00>
> 
> When I use llc trunk as follows on a Corei7 machine I've got following
> assembly code:
> 
> llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s
> 
> 
>     .file   "shufxbug.ll"
>     .text
>     .globl  sample_test
>     .align  16, 0x90
>     .type   sample_test, at function
> sample_test:                            # @sample_test
> # BB#0:                                 # %L.entry
>     movl    4(%esp), %eax
>     movss   304(%eax), %xmm0
>     xorps   %xmm1, %xmm1
>     movl    8(%esp), %eax
>     movups  %xmm1, 624(%eax)
>     pshufd  $65, %xmm0, %xmm0       # xmm0 = xmm0[1,0,0,1]
>     movdqu  %xmm0, 608(%eax)
>     ret
> .Ltmp0:
>     .size   sample_test, .Ltmp0-sample_test
> 
> 
>     .section    ".note.GNU-stack","", at progbits
> 
> 
> It seems to me that this sequence of instruction is building vector:
> 
> 
> <float 'elem_1_of_source', float 'elem_0_of_source' ,
float
> 'elem_0_of_source', float 'elem_1_of_source', float
0.000000e+00, float
> 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
> 
>
<<<<<<<<<<<<<<<<<<<<<<<<
> 
> Umm, isn't the movss (not movps) instruction just loading the bottom
lane of
> the XMM register, so that IF (presumably due to calling convention) we know
> xmm0 is already zeroed this is constructing what's desired? If you
print the
> output, is it actually not what the IR leads you to expect?
> 
> Cheers,
> Dave
> 
> 
>

Reasonably Related Threads

Search for more apparently analogous threads

llvm dev - Feb 2013 - [LLVMdev] Is it a bug or am I missing something ?

[LLVMdev] Is it a bug or am I missing something ?

[LLVMdev] Is it a bug or am I missing something ?

[LLVMdev] Is it a bug or am I missing something ?

Reasonably Related Threads