Hi,

I have run into the following strange behavior and wanted to ask for some advice. For the C program below, function sum() gets inlined in foo(), but the generated code looks very suboptimal (the code is an extract from a larger program).

Below I show the 32-bit x86 assembly as produced by the demo page on the llvm home page ("Output A"). As you can see from the assembly, after sum() is inlined and the loop unrolled, the generated code loads all values of array v (aka &x[i]) into registers before adding any numbers up -- in the process it runs out of registers and starts spilling (in essence copying the doubles from one area of memory to another). Only after that does it proceed to add the numbers up.

But why not add the numbers into one register directly? Clearly this is what the C code is doing -- nothing could have been more explicit. The really strange thing is that if the assignment to p[i] is removed (line marked with "xxx..."), then the code produced is optimal and exactly what one expects. I show this result in "Output B", where you get a beautiful sequence of addsd instructions into register xmm2.

It's all very strange, and it points to some questionable decision making on the part of llvm. I tried different versions of the sum() function (eliminating the loop, for example) but it does not help. Another observation is that the loop variable i (in foo) must be involved: if one does *p = 5 (instead of p[i] = 5), the problem also goes away.

I would appreciate some advice on how to get around this problem.

Thank you for any help,
Brent

double sum( double* v, int v_siz )
{
    double sum = 0.0;
    int i = 0;

    for (; i != v_siz; ++i)
        sum += v[i];

    return sum;
}

double foo(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5;    // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    }
    return s;
}

====== Output A ==========================
foo:                                    # @foo
.Ltmp12:
        .cfi_startproc
# BB#0:
        pushl   %ebx
.Ltmp13:
        .cfi_def_cfa_offset 8
        pushl   %edi
.Ltmp14:
        .cfi_def_cfa_offset 12
        pushl   %esi
.Ltmp15:
        .cfi_def_cfa_offset 16
        subl    $88, %esp
.Ltmp16:
        .cfi_def_cfa_offset 104
.Ltmp17:
        .cfi_offset %esi, -16
.Ltmp18:
        .cfi_offset %edi, -12
.Ltmp19:
        .cfi_offset %ebx, -8
        pxor    %xmm0, %xmm0
        movl    112(%esp), %eax
        testl   %eax, %eax
        je      .LBB1_3
# BB#1:
        xorl    %ebx, %ebx
        movl    108(%esp), %ecx
        movl    104(%esp), %edx
        xorl    %esi, %esi
        .align  16, 0x90
.LBB1_2:                                # %.lr.ph.i
                                        # =>This Inner Loop Header: Depth=1
        movsd   (%edx,%ebx,8), %xmm2
        addsd   .LCPI1_0, %xmm2
        movsd   16(%edx,%ebx,8), %xmm1
        movsd   %xmm1, (%esp)           # 8-byte Spill
        movl    %ebx, %edi
        addl    $1, %edi
        addsd   (%edx,%edi,8), %xmm2
        movsd   136(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 72(%esp)         # 8-byte Spill
        movsd   128(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 64(%esp)         # 8-byte Spill
        movsd   120(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 56(%esp)         # 8-byte Spill
        movsd   112(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 48(%esp)         # 8-byte Spill
        movsd   104(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 40(%esp)         # 8-byte Spill
        movsd   96(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 32(%esp)         # 8-byte Spill
        movsd   88(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 24(%esp)         # 8-byte Spill
        movsd   80(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 16(%esp)         # 8-byte Spill
        movsd   72(%edx,%ebx,8), %xmm1
        movsd   %xmm1, 8(%esp)          # 8-byte Spill
        movsd   64(%edx,%ebx,8), %xmm7
        movsd   56(%edx,%ebx,8), %xmm1
        movsd   48(%edx,%ebx,8), %xmm3
        movsd   40(%edx,%ebx,8), %xmm4
        movsd   32(%edx,%ebx,8), %xmm5
        movsd   24(%edx,%ebx,8), %xmm6
        movl    $5, (%ecx,%ebx,4)
        addsd   (%esp), %xmm2           # 8-byte Folded Reload
        addsd   %xmm6, %xmm2
        addsd   %xmm5, %xmm2
        addsd   %xmm4, %xmm2
        addsd   %xmm3, %xmm2
        addsd   %xmm1, %xmm2
        addsd   %xmm7, %xmm2
        addsd   8(%esp), %xmm2          # 8-byte Folded Reload
        addsd   16(%esp), %xmm2         # 8-byte Folded Reload
        addsd   24(%esp), %xmm2         # 8-byte Folded Reload
        addsd   32(%esp), %xmm2         # 8-byte Folded Reload
        addsd   40(%esp), %xmm2         # 8-byte Folded Reload
        addsd   48(%esp), %xmm2         # 8-byte Folded Reload
        addsd   56(%esp), %xmm2         # 8-byte Folded Reload
        addsd   64(%esp), %xmm2         # 8-byte Folded Reload
        addsd   72(%esp), %xmm2         # 8-byte Folded Reload
        addsd   %xmm2, %xmm0
        adcl    $0, %esi
        cmpl    %eax, %edi
        movl    %edi, %ebx
        jne     .LBB1_2
.LBB1_3:                                # %._crit_edge
        movsd   %xmm0, 80(%esp)
        fldl    80(%esp)
        addl    $88, %esp
        popl    %esi
        popl    %edi
        popl    %ebx
        ret
.Ltmp20:
        .size   foo, .Ltmp20-foo
.Ltmp21:
        .cfi_endproc
.Leh_func_end1:

====== Output B ==========================
foo:                                    # @foo
.Ltmp11:
        .cfi_startproc
# BB#0:
        pushl   %edi
.Ltmp12:
        .cfi_def_cfa_offset 8
        pushl   %esi
.Ltmp13:
        .cfi_def_cfa_offset 12
        subl    $12, %esp
.Ltmp14:
        .cfi_def_cfa_offset 24
.Ltmp15:
        .cfi_offset %esi, -12
.Ltmp16:
        .cfi_offset %edi, -8
        pxor    %xmm0, %xmm0
        movl    32(%esp), %eax
        testl   %eax, %eax
        je      .LBB1_3
# BB#1:
        xorl    %esi, %esi
        movl    24(%esp), %ecx
        pxor    %xmm1, %xmm1
        xorl    %edx, %edx
        .align  16, 0x90
.LBB1_2:                                # %.lr.ph.i
                                        # =>This Inner Loop Header: Depth=1
        movsd   (%ecx,%esi,8), %xmm2
        addsd   %xmm1, %xmm2
        movl    %esi, %edi
        addl    $1, %edi
        addsd   (%ecx,%edi,8), %xmm2
        addsd   16(%ecx,%esi,8), %xmm2
        addsd   24(%ecx,%esi,8), %xmm2
        addsd   32(%ecx,%esi,8), %xmm2
        addsd   40(%ecx,%esi,8), %xmm2
        addsd   48(%ecx,%esi,8), %xmm2
        addsd   56(%ecx,%esi,8), %xmm2
        addsd   64(%ecx,%esi,8), %xmm2
        addsd   72(%ecx,%esi,8), %xmm2
        addsd   80(%ecx,%esi,8), %xmm2
        addsd   88(%ecx,%esi,8), %xmm2
        addsd   96(%ecx,%esi,8), %xmm2
        addsd   104(%ecx,%esi,8), %xmm2
        addsd   112(%ecx,%esi,8), %xmm2
        addsd   120(%ecx,%esi,8), %xmm2
        addsd   128(%ecx,%esi,8), %xmm2
        addsd   136(%ecx,%esi,8), %xmm2
        addsd   %xmm2, %xmm0
        adcl    $0, %edx
        cmpl    %eax, %edi
        movl    %edi, %esi
        jne     .LBB1_2
.LBB1_3:                                # %._crit_edge
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        popl    %esi
        popl    %edi
        ret
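One restructuring worth trying, given that the store is what seems to trigger the spilling, is to move the p[i] stores into a loop of their own, so the accumulation loop contains no stores at all. This is only a sketch -- I have not verified that it keeps the good codegen of Output B:

double foo_split(double *x, int *p, int k)
{
    double s = 0.0;

    /* accumulation loop: no stores, same shape as the good Output B case */
    for (int i = 0; i != k; ++i)
        s += sum(&x[i], 18);

    /* do the stores in a separate pass */
    for (int i = 0; i != k; ++i)
        p[i] = 5;

    return s;
}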
Patrik Hägglund H
2012-Apr-05 10:36 UTC
[LLVMdev] Suboptimal code due to excessive spilling
I don't know much about this, but maybe -mllvm -unroll-count=1 can be used as a workaround?
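For concreteness, assuming the reproducer is in test.c and the clang driver is used (both assumptions on my part), that would be something like:

    clang -O3 -mllvm -unroll-count=1 -S test.c -o test.s

The same option can also be given directly to opt (opt -O3 -unroll-count=1) when running the optimizer by hand.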
/Patrik Hägglund

-----Original Message-----
From: llvmdev-bounces at cs.uiuc.edu [mailto:llvmdev-bounces at cs.uiuc.edu] On Behalf Of Brent Walker
Sent: 28 March 2012 03:18
To: llvmdev
Subject: [LLVMdev] Suboptimal code due to excessive spilling

[original message quoted in full -- snipped; see above]
It would be, unless one actually wanted the unrolling to happen. Thank you for the suggestion though. I have filed a bug (http://llvm.org/bugs/show_bug.cgi?id=12392), so hopefully one of the developers can find the bug or suggest a workaround that avoids the issue.
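For reference, the *p variant I mentioned in my first mail has this shape. Note that a bare *p = 5 changes the meaning (every iteration stores to the same slot), so the sketch below adds a pointer increment to keep the stores equivalent -- whether the incremented form still avoids the spilling is something I have not checked:

double foo2(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);  /* sum() as in my first mail */
        *p = 5;               /* store no longer indexed by i */
        ++p;                  /* untested addition to preserve the original stores */
    }
    return s;
}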
Brent

On Thu, Apr 5, 2012 at 7:36 PM, Patrik Hägglund H
<patrik.h.hagglund at ericsson.com> wrote:
> I don't know much about this, but maybe -mllvm -unroll-count=1 can be used as a workaround?
>
> /Patrik Hägglund
>
> [original message quoted in full -- snipped]