Hi,
I have run into the following strange behavior and wanted to ask for
some advice. For the C program below, function sum() gets inlined into
foo(), but the generated code looks very suboptimal (the code is an
extract from a larger program).
Below I show the 32-bit x86 assembly as produced by the demo page on
the llvm home page ("Output A"). As you can see from the assembly,
after sum() is inlined and the loop unrolled, the generated code
loads all values of array v (aka &x[i]) into registers before adding
any numbers up -- in the process it runs out of registers and starts
spilling (in essence copying the doubles from one area of memory to
another). After that, it proceeds to add the numbers up.
But why not add the numbers into one register directly? Clearly this
is what the C code is doing -- nothing could be more explicit.
The really strange thing is that if the assignment to p[i] is removed
(the line marked with "xxx..."), then the code produced is optimal and
exactly what one expects. I show this result in "Output B", where you
get a beautiful sequence of addsd instructions into register xmm2.
It's all very strange, and it points to some questionable decision
making on the part of llvm. I tried different versions of the sum()
function (eliminating the loop, for example; see the sketch below)
but it does not help.
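Here, roughly, is the loop-free variant (reproduced from memory, so
treat it as an illustration rather than the exact code I compiled):

double sum18(double *v)   /* sum() with the loop removed by hand */
{
    return v[0]  + v[1]  + v[2]  + v[3]  + v[4]  + v[5]
         + v[6]  + v[7]  + v[8]  + v[9]  + v[10] + v[11]
         + v[12] + v[13] + v[14] + v[15] + v[16] + v[17];
}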
Another observation is that the loop variable i (in foo) must be
involved: if one writes *p = 5 (instead of p[i] = 5), the problem also
goes away.
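In fact, the only reliable way I have found to sidestep the problem is
to restructure foo() by hand so that the store no longer sits inside
the summation loop. A sketch of that rearrangement (my own guess at a
workaround; it assumes p and x never overlap, otherwise reordering the
stores changes the semantics):

double foo_split(double *x, int *p, int k)
{
    double s = 0.0;

    /* first pass: do all the stores in their own loop */
    for (int i = 0; i != k; ++i)
        p[i] = 5;

    /* second pass: the sums, with no store in the loop body */
    for (int i = 0; i != k; ++i)
        s += sum(&x[i], 18);

    return s;
}

With the store out of the summation loop I would expect code like
"Output B", but this kind of rewrite is awkward in the real program.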
I would appreciate some advice on how to get around this problem.
Thank you for any help,
Brent
double sum( double* v, int v_siz )
{
    double sum = 0.0;
    int i = 0;

    for (; i != v_siz; ++i)
        sum += v[i];

    return sum;
}

double foo(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5; // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    }
    return s;
}
====== Output A ==========================
foo: # @foo
.Ltmp12:
.cfi_startproc
# BB#0:
pushl %ebx
.Ltmp13:
.cfi_def_cfa_offset 8
pushl %edi
.Ltmp14:
.cfi_def_cfa_offset 12
pushl %esi
.Ltmp15:
.cfi_def_cfa_offset 16
subl $88, %esp
.Ltmp16:
.cfi_def_cfa_offset 104
.Ltmp17:
.cfi_offset %esi, -16
.Ltmp18:
.cfi_offset %edi, -12
.Ltmp19:
.cfi_offset %ebx, -8
pxor %xmm0, %xmm0
movl 112(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %ebx, %ebx
movl 108(%esp), %ecx
movl 104(%esp), %edx
xorl %esi, %esi
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%edx,%ebx,8), %xmm2
addsd .LCPI1_0, %xmm2
movsd 16(%edx,%ebx,8), %xmm1
movsd %xmm1, (%esp) # 8-byte Spill
movl %ebx, %edi
addl $1, %edi
addsd (%edx,%edi,8), %xmm2
movsd 136(%edx,%ebx,8), %xmm1
movsd %xmm1, 72(%esp) # 8-byte Spill
movsd 128(%edx,%ebx,8), %xmm1
movsd %xmm1, 64(%esp) # 8-byte Spill
movsd 120(%edx,%ebx,8), %xmm1
movsd %xmm1, 56(%esp) # 8-byte Spill
movsd 112(%edx,%ebx,8), %xmm1
movsd %xmm1, 48(%esp) # 8-byte Spill
movsd 104(%edx,%ebx,8), %xmm1
movsd %xmm1, 40(%esp) # 8-byte Spill
movsd 96(%edx,%ebx,8), %xmm1
movsd %xmm1, 32(%esp) # 8-byte Spill
movsd 88(%edx,%ebx,8), %xmm1
movsd %xmm1, 24(%esp) # 8-byte Spill
movsd 80(%edx,%ebx,8), %xmm1
movsd %xmm1, 16(%esp) # 8-byte Spill
movsd 72(%edx,%ebx,8), %xmm1
movsd %xmm1, 8(%esp) # 8-byte Spill
movsd 64(%edx,%ebx,8), %xmm7
movsd 56(%edx,%ebx,8), %xmm1
movsd 48(%edx,%ebx,8), %xmm3
movsd 40(%edx,%ebx,8), %xmm4
movsd 32(%edx,%ebx,8), %xmm5
movsd 24(%edx,%ebx,8), %xmm6
movl $5, (%ecx,%ebx,4)
addsd (%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm6, %xmm2
addsd %xmm5, %xmm2
addsd %xmm4, %xmm2
addsd %xmm3, %xmm2
addsd %xmm1, %xmm2
addsd %xmm7, %xmm2
addsd 8(%esp), %xmm2 # 8-byte Folded Reload
addsd 16(%esp), %xmm2 # 8-byte Folded Reload
addsd 24(%esp), %xmm2 # 8-byte Folded Reload
addsd 32(%esp), %xmm2 # 8-byte Folded Reload
addsd 40(%esp), %xmm2 # 8-byte Folded Reload
addsd 48(%esp), %xmm2 # 8-byte Folded Reload
addsd 56(%esp), %xmm2 # 8-byte Folded Reload
addsd 64(%esp), %xmm2 # 8-byte Folded Reload
addsd 72(%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm2, %xmm0
adcl $0, %esi
cmpl %eax, %edi
movl %edi, %ebx
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, 80(%esp)
fldl 80(%esp)
addl $88, %esp
popl %esi
popl %edi
popl %ebx
ret
.Ltmp20:
.size foo, .Ltmp20-foo
.Ltmp21:
.cfi_endproc
.Leh_func_end1:
====== Output B ==========================
foo: # @foo
.Ltmp11:
.cfi_startproc
# BB#0:
pushl %edi
.Ltmp12:
.cfi_def_cfa_offset 8
pushl %esi
.Ltmp13:
.cfi_def_cfa_offset 12
subl $12, %esp
.Ltmp14:
.cfi_def_cfa_offset 24
.Ltmp15:
.cfi_offset %esi, -12
.Ltmp16:
.cfi_offset %edi, -8
pxor %xmm0, %xmm0
movl 32(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %esi, %esi
movl 24(%esp), %ecx
pxor %xmm1, %xmm1
xorl %edx, %edx
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%ecx,%esi,8), %xmm2
addsd %xmm1, %xmm2
movl %esi, %edi
addl $1, %edi
addsd (%ecx,%edi,8), %xmm2
addsd 16(%ecx,%esi,8), %xmm2
addsd 24(%ecx,%esi,8), %xmm2
addsd 32(%ecx,%esi,8), %xmm2
addsd 40(%ecx,%esi,8), %xmm2
addsd 48(%ecx,%esi,8), %xmm2
addsd 56(%ecx,%esi,8), %xmm2
addsd 64(%ecx,%esi,8), %xmm2
addsd 72(%ecx,%esi,8), %xmm2
addsd 80(%ecx,%esi,8), %xmm2
addsd 88(%ecx,%esi,8), %xmm2
addsd 96(%ecx,%esi,8), %xmm2
addsd 104(%ecx,%esi,8), %xmm2
addsd 112(%ecx,%esi,8), %xmm2
addsd 120(%ecx,%esi,8), %xmm2
addsd 128(%ecx,%esi,8), %xmm2
addsd 136(%ecx,%esi,8), %xmm2
addsd %xmm2, %xmm0
adcl $0, %edx
cmpl %eax, %edi
movl %edi, %esi
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, (%esp)
fldl (%esp)
addl $12, %esp
popl %esi
popl %edi
ret
Patrik Hägglund H
2012-Apr-05 10:36 UTC
[LLVMdev] Suboptimal code due to excessive spilling
I don't know much about this, but maybe -mllvm -unroll-count=1 can be used
as a workaround?
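For example, something like this might do it (untested, and I am
assuming the clang driver here; the demo page may not expose the
option):

  clang -O3 -mllvm -unroll-count=1 -S test.c -o test.s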
/Patrik Hägglund
-----Original Message-----
From: llvmdev-bounces at cs.uiuc.edu [mailto:llvmdev-bounces at cs.uiuc.edu] On
Behalf Of Brent Walker
Sent: 28 March 2012 03:18
To: llvmdev
Subject: [LLVMdev] Suboptimal code due to excessive spilling
It would be, unless one actually wanted the unroll to happen. Thank
you for the suggestion though. I have filed a bug
(http://llvm.org/bugs/show_bug.cgi?id=12392), so hopefully one of the
developers can find the bug or suggest a workaround that avoids the
issue.

Brent

On Thu, Apr 5, 2012 at 7:36 PM, Patrik Hägglund H
<patrik.h.hagglund at ericsson.com> wrote:
> I don't know much about this, but maybe -mllvm -unroll-count=1 can be
> used as a workaround?
>
> /Patrik Hägglund