thr3ads.net - llvm dev - [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm [Mar 2017]

If this information is useful, please help other people find it:
Share via:

Ramkumar Ramachandra via llvm-dev

2017-Mar-01 18:17 UTC

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

Hi,


We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT
and have isolated working.ll and broken.ll after middle-end optimizations -- in
the block merge128, notice that broken.ll has a fcmp une comparison to zero and
a jump based on that branch:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31020 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31020, i64* %593, align 1
  %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32
39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32
39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32
39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32
149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
  %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32
154
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
  %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  br i1 %603, label %true73, label %false74

Now, in broken.asm, notice the same merge128 is missing the branch instruction:

.LBB6_55:                               # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq 184(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
movq 672(%rax), %rcx
movq %rcx, 200(%rsp)
movd %rcx, %xmm0
addsd 120(%rax), %xmm0
movq 176(%rsp), %rcx
mulsd 5680(%rcx), %xmm0
movsd %xmm0, 768(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero

We know that this is the right instruction to be looking at, because we can line
it up with working.ll and working.asm:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31025 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31025, i64* %593, align 1
  %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0,
i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0,
i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0,
i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0,
i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
  %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0,
i32 155
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0,
i32 154
  %_rtP__Switch_Threshold = load double, double* %603, align 1
  %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
  %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  br i1 %604, label %true73, label %false74

working.ll is a slightly different model from broken.ll, in that it loads the
"zero value" from memory and does fcmp ogt instead of fcmp une.
Otherwise, they're the same. Now, let's look at working.asm:


.LBB6_55:                               # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5656(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq 176(%rsp), %rax
ucomisd 5648(%rax), %xmm0
movq 184(%rsp), %rcx
jbe .LBB6_56
# BB#128:                               # %true73
movq 672(%rcx), %rdx
jmp .LBB6_129
.LBB6_56:                               # %false74
movq 696(%rcx), %rdx
.LBB6_129:                              # %merge129
movq %rdx, 200(%rsp)
movd %rdx, %xmm0
addsd 120(%rcx), %xmm0
mulsd 5688(%rax), %xmm0
movsd %xmm0, 768(%rcx)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent in broken.asm.


If you want to generate this asm yourself, please use LLVM 3.8's llc on the
.ll files. For viewing convenience, please use a difftool to look at broken.ll
versus working.ll and broken.asm versus working.asm -- I've highlighted the
differences above at merge128.


Further, we have instrumented this code to print out the value of rtb_Sum3_737,
and found that it takes values other than zero, hitting both branches at
execution. We would like to know if the community is aware of this bug, and
which patch fixed it. Finally, see broken-latest.asm to see the output from the
latest llc -- the jump is present and the bug has been fixed:


.LBB6_99:                               # %merge128
movq 8(%rsp), %rcx
movq %rax, 728(%rcx)
movq 8(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 8(%rsp), %rax
movq $0, 744(%rax)
movq 8(%rsp), %rax
movq $0, 752(%rax)
movq 8(%rsp), %rax
movq $0, 760(%rax)
movq 16(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 40(%rsp), %xmm1         # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 16(%rsp), %rax
movsd 5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 32(%rsp), %xmm0         # xmm0 = mem[0],zero
movq 8(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
jne .LBB6_100
jnp .LBB6_101
.LBB6_100:                              # %true73
movq 672(%rax), %rcx
jmp .LBB6_102
.LBB6_101:                              # %false74
movq 696(%rax), %rcx


Thanks.


Ram
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0001.html>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: broken.asm
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.ksh>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: working.asm
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0004.ksh>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: broken.ll
Type: application/octet-stream
Size: 215087 bytes
Desc: broken.ll
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0002.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: working.ll
Type: application/octet-stream
Size: 220892 bytes
Desc: working.ll
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.obj>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: broken-latest.asm
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0005.ksh>

Robinson, Paul via llvm-dev

2017-Mar-06 20:49 UTC

head link

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

LLVM 3.8 was released a year ago.  A lot of things have changed since then.
Are you able to reproduce the problem with the current HEAD, or with the LLVM
4.0 release candidate?
--paulr

From: llvm-dev [mailto:llvm-dev-bounces at lists.llvm.org] On Behalf Of Ramkumar
Ramachandra via llvm-dev
Sent: Wednesday, March 01, 2017 10:17 AM
To: llvm-dev at lists.llvm.org
Cc: Dale Martin; Ketan Surender
Subject: [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is
present in ll, absent in asm


Hi,



We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT
and have isolated working.ll and broken.ll after middle-end optimizations -- in
the block merge128, notice that broken.ll has a fcmp une comparison to zero and
a jump based on that branch:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31020 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31020, i64* %593, align 1
  %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32
39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32
39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32
39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32
149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
  %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32
154
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
  %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  br i1 %603, label %true73, label %false74

Now, in broken.asm, notice the same merge128 is missing the branch instruction:

.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
184(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
movq
672(%rax), %rcx
movq
%rcx, 200(%rsp)
movd
%rcx, %xmm0
addsd
120(%rax), %xmm0
movq
176(%rsp), %rcx
mulsd
5680(%rcx), %xmm0
movsd
%xmm0, 768(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

We know that this is the right instruction to be looking at, because we can line
it up with working.ll and working.asm:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31025 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31025, i64* %593, align 1
  %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0,
i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0,
i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0,
i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0,
i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
  %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0,
i32 155
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0,
i32 154
  %_rtP__Switch_Threshold = load double, double* %603, align 1
  %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
  %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  br i1 %604, label %true73, label %false74

working.ll is a slightly different model from broken.ll, in that it loads the
"zero value" from memory and does fcmp ogt instead of fcmp une.
Otherwise, they're the same. Now, let's look at working.asm:


.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5656(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
176(%rsp), %rax
ucomisd
5648(%rax), %xmm0
movq
184(%rsp), %rcx
jbe
.LBB6_56
# BB#128:                               # %true73
movq
672(%rcx), %rdx
jmp
.LBB6_129
.LBB6_56:                               # %false74
movq
696(%rcx), %rdx
.LBB6_129:                              # %merge129
movq
%rdx, 200(%rsp)
movd
%rdx, %xmm0
addsd
120(%rcx), %xmm0
mulsd
5688(%rax), %xmm0
movsd
%xmm0, 768(%rcx)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent in broken.asm.



If you want to generate this asm yourself, please use LLVM 3.8's llc on the
.ll files. For viewing convenience, please use a difftool to look at broken.ll
versus working.ll and broken.asm versus working.asm -- I've highlighted the
differences above at merge128.



Further, we have instrumented this code to print out the value of rtb_Sum3_737,
and found that it takes values other than zero, hitting both branches at
execution. We would like to know if the community is aware of this bug, and
which patch fixed it. Finally, see broken-latest.asm to see the output from the
latest llc -- the jump is present and the bug has been fixed:


.LBB6_99:                               # %merge128
movq
8(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
8(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
8(%rsp), %rax
movq
$0, 744(%rax)
movq
8(%rsp), %rax
movq
$0, 752(%rax)
movq
8(%rsp), %rax
movq
$0, 760(%rax)
movq
16(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
40(%rsp), %xmm1         # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
16(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
32(%rsp), %xmm0         # xmm0 = mem[0],zero
movq
8(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
jne
.LBB6_100
jnp
.LBB6_101
.LBB6_100:                              # %true73
movq
672(%rax), %rcx
jmp
.LBB6_102
.LBB6_101:                              # %false74
movq
696(%rax), %rcx


Thanks.



Ram
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170306/a6e03c33/attachment-0001.html>

Ramkumar Ramachandra via llvm-dev

2017-Mar-06 20:51 UTC

head link

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

Nope, the bug seems to be fixed in the latest LLVM.


Ram

________________________________
From: Robinson, Paul <paul.robinson at sony.com>
Sent: Monday, March 6, 2017 3:49:40 PM
To: Ramkumar Ramachandra
Cc: Dale Martin; Ketan Surender; llvm-dev at lists.llvm.org
Subject: RE: [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in
ll, absent in asm

LLVM 3.8 was released a year ago.  A lot of things have changed since then.
Are you able to reproduce the problem with the current HEAD, or with the LLVM
4.0 release candidate?
--paulr

From: llvm-dev [mailto:llvm-dev-bounces at lists.llvm.org] On Behalf Of Ramkumar
Ramachandra via llvm-dev
Sent: Wednesday, March 01, 2017 10:17 AM
To: llvm-dev at lists.llvm.org
Cc: Dale Martin; Ketan Surender
Subject: [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is
present in ll, absent in asm


Hi,



We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT
and have isolated working.ll and broken.ll after middle-end optimizations -- in
the block merge128, notice that broken.ll has a fcmp une comparison to zero and
a jump based on that branch:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31020 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31020, i64* %593, align 1
  %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32
39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32
39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32
39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32
149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
  %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32
35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32
154
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32
37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
  %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  br i1 %603, label %true73, label %false74

Now, in broken.asm, notice the same merge128 is missing the branch instruction:

.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
184(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
movq
672(%rax), %rcx
movq
%rcx, 200(%rsp)
movd
%rcx, %xmm0
addsd
120(%rax), %xmm0
movq
176(%rsp), %rcx
mulsd
5680(%rcx), %xmm0
movsd
%xmm0, 768(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

We know that this is the right instruction to be looking at, because we can line
it up with working.ll and working.asm:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31025 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0,
i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31025, i64* %593, align 1
  %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0,
i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0,
i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0,
i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0,
i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double
%_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
  %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0,
i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0,
i32 155
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0,
i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0,
i32 154
  %_rtP__Switch_Threshold = load double, double* %603, align 1
  %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
  %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  br i1 %604, label %true73, label %false74

working.ll is a slightly different model from broken.ll, in that it loads the
"zero value" from memory and does fcmp ogt instead of fcmp une.
Otherwise, they're the same. Now, let's look at working.asm:


.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5656(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
176(%rsp), %rax
ucomisd
5648(%rax), %xmm0
movq
184(%rsp), %rcx
jbe
.LBB6_56
# BB#128:                               # %true73
movq
672(%rcx), %rdx
jmp
.LBB6_129
.LBB6_56:                               # %false74
movq
696(%rcx), %rdx
.LBB6_129:                              # %merge129
movq
%rdx, 200(%rsp)
movd
%rdx, %xmm0
addsd
120(%rcx), %xmm0
mulsd
5688(%rax), %xmm0
movsd
%xmm0, 768(%rcx)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent in broken.asm.



If you want to generate this asm yourself, please use LLVM 3.8's llc on the
.ll files. For viewing convenience, please use a difftool to look at broken.ll
versus working.ll and broken.asm versus working.asm -- I've highlighted the
differences above at merge128.



Further, we have instrumented this code to print out the value of rtb_Sum3_737,
and found that it takes values other than zero, hitting both branches at
execution. We would like to know if the community is aware of this bug, and
which patch fixed it. Finally, see broken-latest.asm to see the output from the
latest llc -- the jump is present and the bug has been fixed:


.LBB6_99:                               # %merge128
movq
8(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
8(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
8(%rsp), %rax
movq
$0, 744(%rax)
movq
8(%rsp), %rax
movq
$0, 752(%rax)
movq
8(%rsp), %rax
movq
$0, 760(%rax)
movq
16(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
40(%rsp), %xmm1         # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
16(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
32(%rsp), %xmm0         # xmm0 = mem[0],zero
movq
8(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
jne
.LBB6_100
jnp
.LBB6_101
.LBB6_100:                              # %true73
movq
672(%rax), %rcx
jmp
.LBB6_102
.LBB6_101:                              # %false74
movq
696(%rax), %rcx


Thanks.



Ram
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170306/e90c99b8/attachment.html>

llvm dev - Mar 2017 - [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm