Hi,
I have run into the following strange behavior and wanted to ask for
some advice. For the C program below, function sum() gets inlined into
foo(), but the generated code looks very suboptimal (the code is an
extract from a larger program).
Below I show the 32-bit x86 assembly as produced by the demo page on
the llvm home page ("Output A"). As you can see from the assembly,
after sum() is inlined and the loop unrolled, the generated code
loads all values of array v (aka &x[i]) into registers before adding
any numbers up -- in the process it runs out of registers and starts
spilling (in essence copying the doubles from one area of memory to
another). After that, it proceeds to add the numbers up.
But why not add the numbers into one register directly? Clearly this
is what the C code is doing -- nothing could be more explicit.
The really strange thing is that if the assignment to p[i] is removed
(the line marked with "xxx..."), then the code produced is optimal and
exactly what one expects. I show this result in "Output B", where you
get a beautiful sequence of addsd instructions into register xmm2.
It's all very strange, and it points to some questionable decision
making on the part of llvm. I tried different versions of the sum()
function (eliminating the loop, for example; see the sketch below)
but it does not help.
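Here, roughly, is the loop-free variant (reproduced from memory, so
treat it as an illustration rather than the exact code I compiled):

double sum18(double *v)   /* sum() with the loop removed by hand */
{
    return v[0]  + v[1]  + v[2]  + v[3]  + v[4]  + v[5]
         + v[6]  + v[7]  + v[8]  + v[9]  + v[10] + v[11]
         + v[12] + v[13] + v[14] + v[15] + v[16] + v[17];
}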
Another observation is that the loop variable i (in foo) must be
involved: if one writes *p = 5 (instead of p[i] = 5), the problem also
goes away.
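In fact, the only reliable way I have found to sidestep the problem is
to restructure foo() by hand so that the store no longer sits inside
the summation loop. A sketch of that rearrangement (my own guess at a
workaround; it assumes p and x never overlap, otherwise reordering the
stores changes the semantics):

double foo_split(double *x, int *p, int k)
{
    double s = 0.0;

    /* first pass: do all the stores in their own loop */
    for (int i = 0; i != k; ++i)
        p[i] = 5;

    /* second pass: the sums, with no store in the loop body */
    for (int i = 0; i != k; ++i)
        s += sum(&x[i], 18);

    return s;
}

With the store out of the summation loop I would expect code like
"Output B", but this kind of rewrite is awkward in the real program.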
I would appreciate some advice on how to get around this problem.
Thank you for any help,
Brent
double sum( double* v, int v_siz )
{
    double sum = 0.0;
    int i = 0;

    for (; i != v_siz; ++i)
        sum += v[i];

    return sum;
}

double foo(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5; // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    }
    return s;
}
====== Output A ==========================
foo: # @foo
.Ltmp12:
.cfi_startproc
# BB#0:
pushl %ebx
.Ltmp13:
.cfi_def_cfa_offset 8
pushl %edi
.Ltmp14:
.cfi_def_cfa_offset 12
pushl %esi
.Ltmp15:
.cfi_def_cfa_offset 16
subl $88, %esp
.Ltmp16:
.cfi_def_cfa_offset 104
.Ltmp17:
.cfi_offset %esi, -16
.Ltmp18:
.cfi_offset %edi, -12
.Ltmp19:
.cfi_offset %ebx, -8
pxor %xmm0, %xmm0
movl 112(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %ebx, %ebx
movl 108(%esp), %ecx
movl 104(%esp), %edx
xorl %esi, %esi
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%edx,%ebx,8), %xmm2
addsd .LCPI1_0, %xmm2
movsd 16(%edx,%ebx,8), %xmm1
movsd %xmm1, (%esp) # 8-byte Spill
movl %ebx, %edi
addl $1, %edi
addsd (%edx,%edi,8), %xmm2
movsd 136(%edx,%ebx,8), %xmm1
movsd %xmm1, 72(%esp) # 8-byte Spill
movsd 128(%edx,%ebx,8), %xmm1
movsd %xmm1, 64(%esp) # 8-byte Spill
movsd 120(%edx,%ebx,8), %xmm1
movsd %xmm1, 56(%esp) # 8-byte Spill
movsd 112(%edx,%ebx,8), %xmm1
movsd %xmm1, 48(%esp) # 8-byte Spill
movsd 104(%edx,%ebx,8), %xmm1
movsd %xmm1, 40(%esp) # 8-byte Spill
movsd 96(%edx,%ebx,8), %xmm1
movsd %xmm1, 32(%esp) # 8-byte Spill
movsd 88(%edx,%ebx,8), %xmm1
movsd %xmm1, 24(%esp) # 8-byte Spill
movsd 80(%edx,%ebx,8), %xmm1
movsd %xmm1, 16(%esp) # 8-byte Spill
movsd 72(%edx,%ebx,8), %xmm1
movsd %xmm1, 8(%esp) # 8-byte Spill
movsd 64(%edx,%ebx,8), %xmm7
movsd 56(%edx,%ebx,8), %xmm1
movsd 48(%edx,%ebx,8), %xmm3
movsd 40(%edx,%ebx,8), %xmm4
movsd 32(%edx,%ebx,8), %xmm5
movsd 24(%edx,%ebx,8), %xmm6
movl $5, (%ecx,%ebx,4)
addsd (%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm6, %xmm2
addsd %xmm5, %xmm2
addsd %xmm4, %xmm2
addsd %xmm3, %xmm2
addsd %xmm1, %xmm2
addsd %xmm7, %xmm2
addsd 8(%esp), %xmm2 # 8-byte Folded Reload
addsd 16(%esp), %xmm2 # 8-byte Folded Reload
addsd 24(%esp), %xmm2 # 8-byte Folded Reload
addsd 32(%esp), %xmm2 # 8-byte Folded Reload
addsd 40(%esp), %xmm2 # 8-byte Folded Reload
addsd 48(%esp), %xmm2 # 8-byte Folded Reload
addsd 56(%esp), %xmm2 # 8-byte Folded Reload
addsd 64(%esp), %xmm2 # 8-byte Folded Reload
addsd 72(%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm2, %xmm0
adcl $0, %esi
cmpl %eax, %edi
movl %edi, %ebx
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, 80(%esp)
fldl 80(%esp)
addl $88, %esp
popl %esi
popl %edi
popl %ebx
ret
.Ltmp20:
.size foo, .Ltmp20-foo
.Ltmp21:
.cfi_endproc
.Leh_func_end1:
====== Output B ==========================
foo: # @foo
.Ltmp11:
.cfi_startproc
# BB#0:
pushl %edi
.Ltmp12:
.cfi_def_cfa_offset 8
pushl %esi
.Ltmp13:
.cfi_def_cfa_offset 12
subl $12, %esp
.Ltmp14:
.cfi_def_cfa_offset 24
.Ltmp15:
.cfi_offset %esi, -12
.Ltmp16:
.cfi_offset %edi, -8
pxor %xmm0, %xmm0
movl 32(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %esi, %esi
movl 24(%esp), %ecx
pxor %xmm1, %xmm1
xorl %edx, %edx
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%ecx,%esi,8), %xmm2
addsd %xmm1, %xmm2
movl %esi, %edi
addl $1, %edi
addsd (%ecx,%edi,8), %xmm2
addsd 16(%ecx,%esi,8), %xmm2
addsd 24(%ecx,%esi,8), %xmm2
addsd 32(%ecx,%esi,8), %xmm2
addsd 40(%ecx,%esi,8), %xmm2
addsd 48(%ecx,%esi,8), %xmm2
addsd 56(%ecx,%esi,8), %xmm2
addsd 64(%ecx,%esi,8), %xmm2
addsd 72(%ecx,%esi,8), %xmm2
addsd 80(%ecx,%esi,8), %xmm2
addsd 88(%ecx,%esi,8), %xmm2
addsd 96(%ecx,%esi,8), %xmm2
addsd 104(%ecx,%esi,8), %xmm2
addsd 112(%ecx,%esi,8), %xmm2
addsd 120(%ecx,%esi,8), %xmm2
addsd 128(%ecx,%esi,8), %xmm2
addsd 136(%ecx,%esi,8), %xmm2
addsd %xmm2, %xmm0
adcl $0, %edx
cmpl %eax, %edi
movl %edi, %esi
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, (%esp)
fldl (%esp)
addl $12, %esp
popl %esi
popl %edi
ret
Patrik Hägglund H
2012-Apr-05 10:36 UTC
[LLVMdev] Suboptimal code due to excessive spilling
I don't know much about this, but maybe -mllvm -unroll-count=1 can be used
as a workaround?
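For example, something like this might do it (untested, and I am
assuming the clang driver here; the demo page may not expose the
option):

  clang -O3 -mllvm -unroll-count=1 -S test.c -o test.s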
/Patrik Hägglund
-----Original Message-----
From: llvmdev-bounces at cs.uiuc.edu [mailto:llvmdev-bounces at cs.uiuc.edu] On
Behalf Of Brent Walker
Sent: 28 March 2012 03:18
To: llvmdev
Subject: [LLVMdev] Suboptimal code due to excessive spilling
It would be, unless one actually wanted the unroll to happen. Thank
you for the suggestion though. I have filed a bug
(http://llvm.org/bugs/show_bug.cgi?id=12392), so hopefully one of the
developers can find the bug or suggest a workaround that avoids the
issue.

Brent

On Thu, Apr 5, 2012 at 7:36 PM, Patrik Hägglund H
<patrik.h.hagglund at ericsson.com> wrote:
> I don't know much about this, but maybe -mllvm -unroll-count=1 can be
> used as a workaround?
>
> /Patrik Hägglund