Ramkumar Ramachandra via llvm-dev
2017-Mar-01 18:17 UTC
[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm
Hi, We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT and have isolated working.ll and broken.ll after middle-end optimizations -- in the block merge128, notice that broken.ll has a fcmp une comparison to zero and a jump based on that branch: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31020 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31020, i64* %593, align 1 %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140 %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00 %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 br i1 %603, label %true73, label %false74 Now, in broken.asm, notice the same merge128 is missing the branch instruction: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 movq 672(%rax), %rcx movq %rcx, 200(%rsp) movd %rcx, %xmm0 addsd 120(%rax), %xmm0 movq 176(%rsp), %rcx mulsd 5680(%rcx), %xmm0 movsd %xmm0, 768(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero We know that this is the right instruction to be looking at, because we can line it up with working.ll and working.asm: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31025 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31025, i64* %593, align 1 %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140 %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154 %_rtP__Switch_Threshold = load double, double* %603, align 1 %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 br i1 %604, label %true73, label %false74 working.ll is a slightly different model from broken.ll, in that it loads the "zero value" from memory and does fcmp ogt instead of fcmp une. Otherwise, they're the same. Now, let's look at working.asm: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5656(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 176(%rsp), %rax ucomisd 5648(%rax), %xmm0 movq 184(%rsp), %rcx jbe .LBB6_56 # BB#128: # %true73 movq 672(%rcx), %rdx jmp .LBB6_129 .LBB6_56: # %false74 movq 696(%rcx), %rdx .LBB6_129: # %merge129 movq %rdx, 200(%rsp) movd %rdx, %xmm0 addsd 120(%rcx), %xmm0 mulsd 5688(%rax), %xmm0 movsd %xmm0, 768(%rcx) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero Notice that the blocks true73 and false74 are completely absent in broken.asm. If you want to generate this asm yourself, please use LLVM 3.8's llc on the .ll files. For viewing convenience, please use a difftool to look at broken.ll versus working.ll and broken.asm versus working.asm -- I've highlighted the differences above at merge128. Further, we have instrumented this code to print out the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know if the community is aware of this bug, and which patch fixed it. Finally, see broken-latest.asm to see the output from the latest llc -- the jump is present and the bug has been fixed: .LBB6_99: # %merge128 movq 8(%rsp), %rcx movq %rax, 728(%rcx) movq 8(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 8(%rsp), %rax movq $0, 744(%rax) movq 8(%rsp), %rax movq $0, 752(%rax) movq 8(%rsp), %rax movq $0, 760(%rax) movq 16(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 40(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 16(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 32(%rsp), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 jne .LBB6_100 jnp .LBB6_101 .LBB6_100: # %true73 movq 672(%rax), %rcx jmp .LBB6_102 .LBB6_101: # %false74 movq 696(%rax), %rcx Thanks. Ram -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0001.html> -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: broken.asm URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.ksh> -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: working.asm URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0004.ksh> -------------- next part -------------- A non-text attachment was scrubbed... Name: broken.ll Type: application/octet-stream Size: 215087 bytes Desc: broken.ll URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0002.obj> -------------- next part -------------- A non-text attachment was scrubbed... Name: working.ll Type: application/octet-stream Size: 220892 bytes Desc: working.ll URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.obj> -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: broken-latest.asm URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0005.ksh>
Robinson, Paul via llvm-dev
2017-Mar-06 20:49 UTC
[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm
LLVM 3.8 was released a year ago. A lot of things have changed since then. Are you able to reproduce the problem with the current HEAD, or with the LLVM 4.0 release candidate? --paulr From: llvm-dev [mailto:llvm-dev-bounces at lists.llvm.org] On Behalf Of Ramkumar Ramachandra via llvm-dev Sent: Wednesday, March 01, 2017 10:17 AM To: llvm-dev at lists.llvm.org Cc: Dale Martin; Ketan Surender Subject: [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm Hi, We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT and have isolated working.ll and broken.ll after middle-end optimizations -- in the block merge128, notice that broken.ll has a fcmp une comparison to zero and a jump based on that branch: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31020 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31020, i64* %593, align 1 %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140 %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00 %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 br i1 %603, label %true73, label %false74 Now, in broken.asm, notice the same merge128 is missing the branch instruction: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 movq 672(%rax), %rcx movq %rcx, 200(%rsp) movd %rcx, %xmm0 addsd 120(%rax), %xmm0 movq 176(%rsp), %rcx mulsd 5680(%rcx), %xmm0 movsd %xmm0, 768(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero We know that this is the right instruction to be looking at, because we can line it up with working.ll and working.asm: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31025 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31025, i64* %593, align 1 %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140 %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154 %_rtP__Switch_Threshold = load double, double* %603, align 1 %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 br i1 %604, label %true73, label %false74 working.ll is a slightly different model from broken.ll, in that it loads the "zero value" from memory and does fcmp ogt instead of fcmp une. Otherwise, they're the same. Now, let's look at working.asm: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5656(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 176(%rsp), %rax ucomisd 5648(%rax), %xmm0 movq 184(%rsp), %rcx jbe .LBB6_56 # BB#128: # %true73 movq 672(%rcx), %rdx jmp .LBB6_129 .LBB6_56: # %false74 movq 696(%rcx), %rdx .LBB6_129: # %merge129 movq %rdx, 200(%rsp) movd %rdx, %xmm0 addsd 120(%rcx), %xmm0 mulsd 5688(%rax), %xmm0 movsd %xmm0, 768(%rcx) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero Notice that the blocks true73 and false74 are completely absent in broken.asm. If you want to generate this asm yourself, please use LLVM 3.8's llc on the .ll files. For viewing convenience, please use a difftool to look at broken.ll versus working.ll and broken.asm versus working.asm -- I've highlighted the differences above at merge128. Further, we have instrumented this code to print out the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know if the community is aware of this bug, and which patch fixed it. Finally, see broken-latest.asm to see the output from the latest llc -- the jump is present and the bug has been fixed: .LBB6_99: # %merge128 movq 8(%rsp), %rcx movq %rax, 728(%rcx) movq 8(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 8(%rsp), %rax movq $0, 744(%rax) movq 8(%rsp), %rax movq $0, 752(%rax) movq 8(%rsp), %rax movq $0, 760(%rax) movq 16(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 40(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 16(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 32(%rsp), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 jne .LBB6_100 jnp .LBB6_101 .LBB6_100: # %true73 movq 672(%rax), %rcx jmp .LBB6_102 .LBB6_101: # %false74 movq 696(%rax), %rcx Thanks. Ram -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170306/a6e03c33/attachment-0001.html>
Ramkumar Ramachandra via llvm-dev
2017-Mar-06 20:51 UTC
[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm
Nope, the bug seems to be fixed in the latest LLVM. Ram ________________________________ From: Robinson, Paul <paul.robinson at sony.com> Sent: Monday, March 6, 2017 3:49:40 PM To: Ramkumar Ramachandra Cc: Dale Martin; Ketan Surender; llvm-dev at lists.llvm.org Subject: RE: [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm LLVM 3.8 was released a year ago. A lot of things have changed since then. Are you able to reproduce the problem with the current HEAD, or with the LLVM 4.0 release candidate? --paulr From: llvm-dev [mailto:llvm-dev-bounces at lists.llvm.org] On Behalf Of Ramkumar Ramachandra via llvm-dev Sent: Wednesday, March 01, 2017 10:17 AM To: llvm-dev at lists.llvm.org Cc: Dale Martin; Ketan Surender Subject: [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm Hi, We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT and have isolated working.ll and broken.ll after middle-end optimizations -- in the block merge128, notice that broken.ll has a fcmp une comparison to zero and a jump based on that branch: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31020 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31020, i64* %593, align 1 %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140 %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00 %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8 br i1 %603, label %true73, label %false74 Now, in broken.asm, notice the same merge128 is missing the branch instruction: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 movq 672(%rax), %rcx movq %rcx, 200(%rsp) movd %rcx, %xmm0 addsd 120(%rax), %xmm0 movq 176(%rsp), %rcx mulsd 5680(%rcx), %xmm0 movsd %xmm0, 768(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero We know that this is the right instruction to be looking at, because we can line it up with working.ll and working.asm: merge128: ; preds = %true71, %false72 %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38 %591 = bitcast double* %590 to i64* %_rtB__Step31025 = load i64, i64* %591, align 1 %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39 %593 = bitcast [4 x double]* %592 to i64* store i64 %_rtB__Step31025, i64* %593, align 1 %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1 store double 0.000000e+00, double* %594, align 1 %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2 store double 0.000000e+00, double* %595, align 1 %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3 store double 0.000000e+00, double* %596, align 1 %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149 %_rtP__Kp_Gain_n = load double, double* %597, align 1 %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0 %_rtB__Switch_k_el = load double, double* %598, align 1 %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140 %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0 store double %tmp141, double* %599, align 1 %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155 %_rtP__Ki_Gain_b = load double, double* %600, align 1 %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0 %_rtB__Switch_k_el735 = load double, double* %601, align 1 %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735 %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0 store double %tmp142, double* %602, align 1 %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8 %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8 %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154 %_rtP__Switch_Threshold = load double, double* %603, align 1 %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8 br i1 %604, label %true73, label %false74 working.ll is a slightly different model from broken.ll, in that it loads the "zero value" from memory and does fcmp ogt instead of fcmp une. Otherwise, they're the same. Now, let's look at working.asm: .LBB6_55: # %merge128 movq 184(%rsp), %rcx movq %rax, 728(%rcx) movq 184(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 184(%rsp), %rax movq $0, 744(%rax) movq 184(%rsp), %rax movq $0, 752(%rax) movq 184(%rsp), %rax movq $0, 760(%rax) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 176(%rsp), %rax movsd 5656(%rax), %xmm0 # xmm0 = mem[0],zero movq 184(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero movq 176(%rsp), %rax ucomisd 5648(%rax), %xmm0 movq 184(%rsp), %rcx jbe .LBB6_56 # BB#128: # %true73 movq 672(%rcx), %rdx jmp .LBB6_129 .LBB6_56: # %false74 movq 696(%rcx), %rdx .LBB6_129: # %merge129 movq %rdx, 200(%rsp) movd %rdx, %xmm0 addsd 120(%rcx), %xmm0 mulsd 5688(%rax), %xmm0 movsd %xmm0, 768(%rcx) movq 176(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero Notice that the blocks true73 and false74 are completely absent in broken.asm. If you want to generate this asm yourself, please use LLVM 3.8's llc on the .ll files. For viewing convenience, please use a difftool to look at broken.ll versus working.ll and broken.asm versus working.asm -- I've highlighted the differences above at merge128. Further, we have instrumented this code to print out the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know if the community is aware of this bug, and which patch fixed it. Finally, see broken-latest.asm to see the output from the latest llc -- the jump is present and the bug has been fixed: .LBB6_99: # %merge128 movq 8(%rsp), %rcx movq %rax, 728(%rcx) movq 8(%rsp), %rax movq 728(%rax), %rcx movq %rcx, 736(%rax) movq 8(%rsp), %rax movq $0, 744(%rax) movq 8(%rsp), %rax movq $0, 752(%rax) movq 8(%rsp), %rax movq $0, 760(%rax) movq 16(%rsp), %rax movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd 40(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero addsd %xmm0, %xmm1 movsd %xmm1, 672(%rax) movq 16(%rsp), %rax movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax mulsd 648(%rax), %xmm0 movsd %xmm0, 704(%rax) movsd 32(%rsp), %xmm0 # xmm0 = mem[0],zero movq 8(%rsp), %rax xorpd %xmm1, %xmm1 ucomisd %xmm1, %xmm0 jne .LBB6_100 jnp .LBB6_101 .LBB6_100: # %true73 movq 672(%rax), %rcx jmp .LBB6_102 .LBB6_101: # %false74 movq 696(%rax), %rcx Thanks. Ram -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170306/e90c99b8/attachment.html>