Joan Lluch via llvm-dev
2019-Jun-30 19:04 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi All, The following code : void hexagon2( int *a, int *res ) { int i = 100; while ( i-- ) { *res++ = *a++; } } gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. This is the final assembly code generated by LLVM 9.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: { r0 = #-100 } // encoding: [0x80,0xf3,0xdf,0x78] .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r3 = add(r0,#1) r4 = memw(r2++#4) memw(r1++#4) = r4.new } // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] { p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 r0 = r3 } // encoding: [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B9_PCREL // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function This is the assembly code generated by LLVM 7.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: // %while.body.preheader { loop0(.LBB0_3,#100) } // encoding: [0x20'A',0xc0'A',0x03'A',0x69'A'] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B7_PCREL .Ltmp0: // Block address taken .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r0 = memw(r2++#4) memw(r1++#4) = r0.new } :endloop0 // encoding: [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not made use of Hardware loops. This is in my opinion a bad regression from some earlier version. This is not an isolated case, more cases of the same LLVM 9 ‘defect’ are easy to find. I have investigated the issue and I identified the root cause of it, which is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to increment the loop Induction Variable, instead of an “add” instruction like LLVM 7.0. This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 while.body: ; preds = %entry.old, %while.body %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) %math = extractvalue { i32, i1 } %7, 0 %ov = extractvalue { i32, i1 } %7, 1 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %ov, label %while.end, label %while.body And this is the same excerpt on LLVM 7.0 while.body: ; preds = %while.body.preheader, %while.body %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, %while.body ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %while.body.preheader ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %while.body.preheader ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %lsr.iv.next = add nsw i32 %lsr.iv, 1 %tobool = icmp eq i32 %lsr.iv.next, 0 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %tobool, label %while.end, label %while.body LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal code, specially compared with what LLVM 7.0 produces. The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 is this: bb.5: ; predecessors: %bb.1 successors: %bb.3(0x80000000); %bb.3(100.00%) %8:intregs = A2_tfrsi -100 J2_jump %bb.3, implicit-def $pc bb.3.while.body: ; predecessors: %bb.3, %bb.5 successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3(96.88%) %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %14:predregs = C2_cmpgtu %0:intregs, %3:intregs J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc J2_jump %bb.4, implicit-def dead $pc The same code on LLVM 7 is this: bb.2.while.body.preheader: ; predecessors: %bb.1 successors: %bb.4(0x80000000); %bb.4(200.00%) %11:intregs = A2_tfrsi -100 J2_jump %bb.4, implicit-def dead $pc bb.4.while.body: ; predecessors: %bb.2, %bb.4 successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%) %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %13:predregs = C2_cmpeqi %3:intregs, 0 J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc J2_jump %bb.5, implicit-def dead $pc The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware Loop as shown in the assembly code earlier in this message. However, LLVM 9 can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction that gets generated. This instruction is a consequence of the introduction of the “llvm.addu” intrinsic that I showed earlier. I am presenting here the case of Hexagon, but I suspect the same sub-optimal code may happen for the PowerPC (not checked yet). All the code excerpts were obtained with -Os flags I’m highly interested in getting in contact with someone familiar with the Hexagon/PowerPC targets regarding this subject. Any pointers to the right persons would be appreciated. My interest comes from the fact that I am proposing an improvement on the LSR pass that affects all targets and I need hardware loops to be properly generated in LLVM 9 like they used to be in LLVM 7. Thanks John. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190630/508bac91/attachment.html>
Sanjay Patel via llvm-dev
2019-Jul-01 13:04 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi - I don't know much about Hexagon, but I'm probably responsible ( https://reviews.llvm.org/D57789 and follow-ons) for the CGP difference that you are seeing. There are a couple of ways to deal with this: 1. Adjust the target hook that controls this transform to return 'false' if there is a better alternative: /// Try to convert math with an overflow comparison into the corresponding DAG /// node operation. Targets may want to override this independently of whether /// the operation is legal/custom for the given type because it may obscure /// matching of other patterns. virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const { 2. Enhance the pattern-matching that forms a hardware loop to accept the overflow intrinsics. Either way, please file bugs and/or add some regression tests for the problem cases on these targets. If those had existed before, I would have noticed the problem and tried to solve it as part of the initial change. On Sun, Jun 30, 2019 at 1:04 PM Joan Lluch via llvm-dev < llvm-dev at lists.llvm.org> wrote:> Hi All, > > The following code : > > void hexagon2( int *a, int *res ) > { > int i = 100; > while ( i-- ) { > *res++ = *a++; > } > } > > gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a > Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. > > > This is the final assembly code generated by LLVM 9.0 : > > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: > [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: > .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt(r0,#399) > } // encoding: > [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: > .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: > { > r0 = #-100 > } // encoding: [0x80,0xf3,0xdf,0x78] > .LBB0_3: // %while.body > // =>This Inner Loop Header: > Depth=1 > { > r3 = add(r0,#1) > r4 = memw(r2++#4) > memw(r1++#4) = r4.new > } // encoding: > [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] > { > p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 > r0 = r3 > } // encoding: > [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] > // fixup A - offset: 0, value: > .LBB0_3, kind: fixup_Hexagon_B9_PCREL > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: > [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: > memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > > This is the assembly code generated by LLVM 7.0 : > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: > [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: > .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt(r0,#399) > } // encoding: > [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: > .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: // %while.body.preheader > { > loop0(.LBB0_3,#100) > } // encoding: > [0x20'A',0xc0'A',0x03'A',0x69'A'] > // fixup A - offset: 0, value: > .LBB0_3, kind: fixup_Hexagon_B7_PCREL > .Ltmp0: // Block address taken > .LBB0_3: // %while.body > // =>This Inner Loop Header: > Depth=1 > { > r0 = memw(r2++#4) > memw(r1++#4) = r0.new > } :endloop0 // encoding: > [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: > [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: > memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not > made use of Hardware loops. This is in my opinion a bad regression from > some earlier version. This is not an isolated case, more cases of the same > LLVM 9 ‘defect’ are easy to find. > > I have investigated the issue and I identified the root cause of it, which > is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to > increment the loop Induction Variable, instead of an “add” instruction like > LLVM 7.0. > > > This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 > > while.body: ; preds = %entry.old, > %while.body > %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) > %math = extractvalue { i32, i1 } %7, 0 > %ov = extractvalue { i32, i1 } %7, 1 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %ov, label %while.end, label %while.body > > > And this is the same excerpt on LLVM 7.0 > > while.body: ; preds = % > while.body.preheader, %while.body > %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, % > while.body ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, % > while.body.preheader ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, % > while.body.preheader ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %lsr.iv.next = add nsw i32 %lsr.iv, 1 > %tobool = icmp eq i32 %lsr.iv.next, 0 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %tobool, label %while.end, label %while.body > > > LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware > Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal > code, specially compared with what LLVM 7.0 produces. > > The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 > is this: > > bb.5: > ; predecessors: %bb.1 > successors: %bb.3(0x80000000); %bb.3(100.00%) > > %8:intregs = A2_tfrsi -100 > J2_jump %bb.3, implicit-def $pc > > bb.3.while.body: > ; predecessors: %bb.3, %bb.5 > successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3 > (96.88%) > > %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 > %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 > %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 > %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: > (load 4 from %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: > (store 4 into %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %14:predregs = C2_cmpgtu %0:intregs, %3:intregs > J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc > J2_jump %bb.4, implicit-def dead $pc > > > The same code on LLVM 7 is this: > > bb.2.while.body.preheader: > ; predecessors: %bb.1 > successors: %bb.4(0x80000000); %bb.4(200.00%) > > %11:intregs = A2_tfrsi -100 > J2_jump %bb.4, implicit-def dead $pc > > bb.4.while.body: > ; predecessors: %bb.2, %bb.4 > successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4 > (96.88%) > > %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 > %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 > %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 > %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from > %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into > %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %13:predregs = C2_cmpeqi %3:intregs, 0 > J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc > J2_jump %bb.5, implicit-def dead $pc > > > The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware > Loop as shown in the assembly code earlier in this message. However, LLVM 9 > can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction > that gets generated. This instruction is a consequence of the introduction > of the “llvm.addu” intrinsic that I showed earlier. > > I am presenting here the case of Hexagon, but I suspect the same > sub-optimal code may happen for the PowerPC (not checked yet). > > All the code excerpts were obtained with -Os flags > > I’m highly interested in getting in contact with someone familiar with the > Hexagon/PowerPC targets regarding this subject. Any pointers to the right > persons would be appreciated. My interest comes from the fact that I am > proposing an improvement on the LSR pass that affects all targets and I > need hardware loops to be properly generated in LLVM 9 like they used to be > in LLVM 7. > > Thanks > > John. > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190701/47ff0cc7/attachment-0001.html>
Krzysztof Parzyszek via llvm-dev
2019-Jul-01 15:51 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
The Hexagon part is fixed in r364790. -- Krzysztof Parzyszek kparzysz at quicinc.com<mailto:kparzysz at quicinc.com> LLVM compiler development From: llvm-dev <llvm-dev-bounces at lists.llvm.org> On Behalf Of Joan Lluch via llvm-dev Sent: Sunday, June 30, 2019 2:04 PM To: llvm-dev <llvm-dev at lists.llvm.org> Subject: [EXT] [llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic. Hi All, The following code : void hexagon2( int *a, int *res ) { int i = 100; while ( i-- ) { *res++ = *a++; } } gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. This is the final assembly code generated by LLVM 9.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: { r0 = #-100 } // encoding: [0x80,0xf3,0xdf,0x78] .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r3 = add(r0,#1) r4 = memw(r2++#4) memw(r1++#4) = r4.new } // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] { p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 r0 = r3 } // encoding: [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B9_PCREL // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function This is the assembly code generated by LLVM 7.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: // %while.body.preheader { loop0(.LBB0_3,#100) } // encoding: [0x20'A',0xc0'A',0x03'A',0x69'A'] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B7_PCREL .Ltmp0: // Block address taken .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r0 = memw(r2++#4) memw(r1++#4) = r0.new } :endloop0 // encoding: [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not made use of Hardware loops. This is in my opinion a bad regression from some earlier version. This is not an isolated case, more cases of the same LLVM 9 ‘defect’ are easy to find. I have investigated the issue and I identified the root cause of it, which is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to increment the loop Induction Variable, instead of an “add” instruction like LLVM 7.0. This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 while.body: ; preds = %entry.old, %while.body %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) %math = extractvalue { i32, i1 } %7, 0 %ov = extractvalue { i32, i1 } %7, 1 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %ov, label %while.end, label %while.body And this is the same excerpt on LLVM 7.0 while.body: ; preds = %while.body.preheader, %while.body %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, %while.body ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %while.body.preheader ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %while.body.preheader ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %lsr.iv.next = add nsw i32 %lsr.iv, 1 %tobool = icmp eq i32 %lsr.iv.next, 0 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %tobool, label %while.end, label %while.body LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal code, specially compared with what LLVM 7.0 produces. The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 is this: bb.5: ; predecessors: %bb.1 successors: %bb.3(0x80000000); %bb.3(100.00%) %8:intregs = A2_tfrsi -100 J2_jump %bb.3, implicit-def $pc bb.3.while.body: ; predecessors: %bb.3, %bb.5 successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3(96.88%) %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %14:predregs = C2_cmpgtu %0:intregs, %3:intregs J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc J2_jump %bb.4, implicit-def dead $pc The same code on LLVM 7 is this: bb.2.while.body.preheader: ; predecessors: %bb.1 successors: %bb.4(0x80000000); %bb.4(200.00%) %11:intregs = A2_tfrsi -100 J2_jump %bb.4, implicit-def dead $pc bb.4.while.body: ; predecessors: %bb.2, %bb.4 successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%) %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %13:predregs = C2_cmpeqi %3:intregs, 0 J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc J2_jump %bb.5, implicit-def dead $pc The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware Loop as shown in the assembly code earlier in this message. However, LLVM 9 can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction that gets generated. This instruction is a consequence of the introduction of the “llvm.addu” intrinsic that I showed earlier. I am presenting here the case of Hexagon, but I suspect the same sub-optimal code may happen for the PowerPC (not checked yet). All the code excerpts were obtained with -Os flags I’m highly interested in getting in contact with someone familiar with the Hexagon/PowerPC targets regarding this subject. Any pointers to the right persons would be appreciated. My interest comes from the fact that I am proposing an improvement on the LSR pass that affects all targets and I need hardware loops to be properly generated in LLVM 9 like they used to be in LLVM 7. Thanks John. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190701/4e58c72a/attachment-0001.html>
Joan Lluch via llvm-dev
2019-Jul-02 13:44 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi Sanjay, Thank you for your reply. It was really useful. I implemented one of your suggestions, particularly having ‘shouldFormOverflowOp’ to always return false for the Hexagon target and for my purposes it totally fixed the problem!!. I received another reply from Krzysztof who proposed a custom target solution which also works. I am crossing my replies to both of you so to make you both aware. I paste the link to the code he submitted just to make it easier to find: https://reviews.llvm.org/rG5abf80cdfa34 <https://reviews.llvm.org/rG5abf80cdfa34> Thanks! John> On 1 Jul 2019, at 15:04, Sanjay Patel <spatel at rotateright.com> wrote: > > Hi - > > I don't know much about Hexagon, but I'm probably responsible ( https://reviews.llvm.org/D57789 <https://reviews.llvm.org/D57789> and follow-ons) for the CGP difference that you are seeing. > > There are a couple of ways to deal with this: > 1. Adjust the target hook that controls this transform to return 'false' if there is a better alternative: > /// Try to convert math with an overflow comparison into the corresponding DAG > /// node operation. Targets may want to override this independently of whether > /// the operation is legal/custom for the given type because it may obscure > /// matching of other patterns. > virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const { > > 2. Enhance the pattern-matching that forms a hardware loop to accept the overflow intrinsics. > > Either way, please file bugs and/or add some regression tests for the problem cases on these targets. If those had existed before, I would have noticed the problem and tried to solve it as part of the initial change. > > On Sun, Jun 30, 2019 at 1:04 PM Joan Lluch via llvm-dev <llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>> wrote: > Hi All, > > The following code : > > void hexagon2( int *a, int *res ) > { > int i = 100; > while ( i-- ) { > *res++ = *a++; > } > } > > gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. > > > This is the final assembly code generated by LLVM 9.0 : > > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt <http://cmp.gt/>(r0,#399) > } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: > { > r0 = #-100 > } // encoding: [0x80,0xf3,0xdf,0x78] > .LBB0_3: // %while.body > // =>This Inner Loop Header: Depth=1 > { > r3 = add(r0,#1) > r4 = memw(r2++#4) > memw(r1++#4) = r4.new > } // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] > { > p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 > r0 = r3 > } // encoding: [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] > // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B9_PCREL > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > > This is the assembly code generated by LLVM 7.0 : > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt <http://cmp.gt/>(r0,#399) > } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: // %while.body.preheader > { > loop0(.LBB0_3,#100) > } // encoding: [0x20'A',0xc0'A',0x03'A',0x69'A'] > // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B7_PCREL > .Ltmp0: // Block address taken > .LBB0_3: // %while.body > // =>This Inner Loop Header: Depth=1 > { > r0 = memw(r2++#4) > memw(r1++#4) = r0.new > } :endloop0 // encoding: [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not made use of Hardware loops. This is in my opinion a bad regression from some earlier version. This is not an isolated case, more cases of the same LLVM 9 ‘defect’ are easy to find. > > I have investigated the issue and I identified the root cause of it, which is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to increment the loop Induction Variable, instead of an “add” instruction like LLVM 7.0. > > > This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 > > while.body: ; preds = %entry.old, %while.body > %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) > %math = extractvalue { i32, i1 } %7, 0 > %ov = extractvalue { i32, i1 } %7, 1 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %ov, label %while.end, label %while.body > > > And this is the same excerpt on LLVM 7.0 > > while.body: ; preds = %while.body.preheader, %while.body > %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, %while.body ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %while.body.preheader ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %while.body.preheader ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %lsr.iv.next = add nsw i32 %lsr.iv, 1 > %tobool = icmp eq i32 %lsr.iv.next, 0 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %tobool, label %while.end, label %while.body > > > LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal code, specially compared with what LLVM 7.0 produces. > > The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 is this: > > bb.5: > ; predecessors: %bb.1 > successors: %bb.3(0x80000000); %bb.3(100.00%) > > %8:intregs = A2_tfrsi -100 > J2_jump %bb.3, implicit-def $pc > > bb.3.while.body: > ; predecessors: %bb.3, %bb.5 > successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3(96.88%) > > %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 > %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 > %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 > %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %14:predregs = C2_cmpgtu %0:intregs, %3:intregs > J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc > J2_jump %bb.4, implicit-def dead $pc > > > The same code on LLVM 7 is this: > > bb.2.while.body.preheader: > ; predecessors: %bb.1 > successors: %bb.4(0x80000000); %bb.4(200.00%) > > %11:intregs = A2_tfrsi -100 > J2_jump %bb.4, implicit-def dead $pc > > bb.4.while.body: > ; predecessors: %bb.2, %bb.4 > successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%) > > %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 > %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 > %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 > %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %13:predregs = C2_cmpeqi %3:intregs, 0 > J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc > J2_jump %bb.5, implicit-def dead $pc > > > The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware Loop as shown in the assembly code earlier in this message. However, LLVM 9 can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction that gets generated. This instruction is a consequence of the introduction of the “llvm.addu” intrinsic that I showed earlier. > > I am presenting here the case of Hexagon, but I suspect the same sub-optimal code may happen for the PowerPC (not checked yet). > > All the code excerpts were obtained with -Os flags > > I’m highly interested in getting in contact with someone familiar with the Hexagon/PowerPC targets regarding this subject. Any pointers to the right persons would be appreciated. My interest comes from the fact that I am proposing an improvement on the LSR pass that affects all targets and I need hardware loops to be properly generated in LLVM 9 like they used to be in LLVM 7. > > Thanks > > John. > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org> > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev <https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev>-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190702/f9af704f/attachment-0001.html>
Joan Lluch via llvm-dev
2019-Jul-02 13:50 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi Krzysztof, Thank you very much for your quick reaction. This is very much appreciated. I have pulled your code from the main LLVM repo and found that it works fine!. However, after looking at it I have a question about it. My understanding is that your code only replaces uaddo, usubo with ‘1’ as the second operand. This certainly solves the hardware loop problem and I assume it’s enough, as there’s no possible loops with scalable increments? I also want to mention that based on the reply I received from Sanjay, I also tried to override ‘shouldFormOverflowOp’ to always return false. This also solves the issue, but of course it prevents the ‘.with.overflow’ intrinsics to be generated at all. My understanding is that this fully mimics LLVM 7.0 behaviour, but I do not have the required knowledge on Hexagon to give an opinion about whether this is useful. I’m just posting this as a matter of information Thanks again! John> On 1 Jul 2019, at 17:51, Krzysztof Parzyszek <kparzysz at quicinc.com <mailto:kparzysz at quicinc.com>> wrote: > > The Hexagon part is fixed in r364790. > > -- > Krzysztof Parzyszek kparzysz at quicinc.com <mailto:kparzysz at quicinc.com> LLVM compiler development >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190702/3f75791d/attachment.html>
Krzysztof Parzyszek via llvm-dev
2019-Jul-02 14:40 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi John, The approach I took here was to deal with the intrinsics once they’re generated. The hardware loop code tries to evaluate the iteration count, and when it fails to do so, it skips the loop. The complication with unsigned overflow as a loop exit condition is that it makes calculating of the iteration count harder, and the current code for hardware loop doesn’t handle it. It should be possible to expand it, but it would be a non-trivial amount of work. Instead, I decided to rewrite the simple cases of unsigned add/sub, where the overflow condition can be rewritten in a way that is currently recognized. Varying increments are not translatable into a hardware loop, but increments that are not 1 are, as long as we can calculate the iteration count (the hardware loop instruction takes a register containing the iteration count as an operand). My guess is that if there was a loop with a non-unit increment, whose exit condition was an unsigned overflow, that loop would not have been handled by the hardware loop generation to begin with, so only dealing with +/-1 should be sufficient. In the long term improving the hardware loop generation would be a better approach, but it’s more work. -- Krzysztof Parzyszek kparzysz at quicinc.com<mailto:kparzysz at quicinc.com> LLVM compiler development From: Joan Lluch <joan.lluch at icloud.com> Sent: Tuesday, July 2, 2019 8:45 AM To: Krzysztof Parzyszek <kparzysz at quicinc.com> Cc: llvm-dev at lists.llvm.org; Sanjay Patel <spatel at rotateright.com> Subject: [EXT] Re: [llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic. Hi Krzysztof, Thank you very much for such a quick reaction. This is very much appreciated. I have pulled your code from the main LLVM repo and found that it works fine!. However, after looking at it I have a question about it. My understanding is that your code only replaces uaddo, usubo with ‘1’ as the second operand. This certainly solves the hardware loop problem and I assume it’s enough, as there’s no possible loops with scalable increments? I also want to mention that based on the reply I received from Sanjay, I also tried to override ‘shouldFormOverflowOp’ to always return false. This also solves the issue, but of course it prevents the ‘.with.overflow’ intrinsics to be generated at all. My understanding is that this fully mimics LLVM 7.0 behaviour, but I do not have the required knowledge on Hexagon to give an opinion about whether this is useful. I’m just posting this as a matter of information Thanks again! John On 1 Jul 2019, at 17:51, Krzysztof Parzyszek <kparzysz at quicinc.com<mailto:kparzysz at quicinc.com>> wrote: The Hexagon part is fixed in r364790. -- Krzysztof Parzyszek kparzysz at quicinc.com<mailto:kparzysz at quicinc.com> LLVM compiler development From: llvm-dev <llvm-dev-bounces at lists.llvm.org<mailto:llvm-dev-bounces at lists.llvm.org>> On Behalf Of Joan Lluch via llvm-dev Sent: Sunday, June 30, 2019 2:04 PM To: llvm-dev <llvm-dev at lists.llvm.org<mailto:llvm-dev at lists.llvm.org>> Subject: [EXT] [llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic. Hi All, The following code : void hexagon2( int *a, int *res ) { int i = 100; while ( i-- ) { *res++ = *a++; } } gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. This is the final assembly code generated by LLVM 9.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: { r0 = #-100 } // encoding: [0x80,0xf3,0xdf,0x78] .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r3 = add(r0,#1) r4 = memw(r2++#4) memw(r1++#4) = r4.new } // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] { p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 r0 = r3 } // encoding: [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B9_PCREL // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function This is the assembly code generated by LLVM 7.0 : .text .file "main.c" .globl hexagon2 // -- Begin function hexagon2 .p2align 2 .type hexagon2, at function hexagon2: // @hexagon2 // %bb.0: // %entry.old { p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 r2 = r0 allocframe(#0) } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL // %bb.1: // %entry.old { r0 = sub(r1,r0) } // encoding: [0x00,0xc1,0x20,0xf3] { if (p0.new) jump:nt .LBB0_5 p0 = cmp.gt(r0,#399) } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL // %bb.2: // %while.body.preheader { loop0(.LBB0_3,#100) } // encoding: [0x20'A',0xc0'A',0x03'A',0x69'A'] // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B7_PCREL .Ltmp0: // Block address taken .LBB0_3: // %while.body // =>This Inner Loop Header: Depth=1 { r0 = memw(r2++#4) memw(r1++#4) = r0.new } :endloop0 // encoding: [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] // %bb.4: // %while.end { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .LBB0_5: // %while.body.rtli { call memmove r1:0 = combine(r2,r1) r2 = #400 } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL { r31:30 = dealloc_return(r30):raw } // encoding: [0x1e,0xc0,0x1e,0x96] .Lfunc_end0: .size hexagon2, .Lfunc_end0-hexagon2 // -- End function The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not made use of Hardware loops. This is in my opinion a bad regression from some earlier version. This is not an isolated case, more cases of the same LLVM 9 ‘defect’ are easy to find. I have investigated the issue and I identified the root cause of it, which is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to increment the loop Induction Variable, instead of an “add” instruction like LLVM 7.0. This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 while.body: ; preds = %entry.old, %while.body %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) %math = extractvalue { i32, i1 } %7, 0 %ov = extractvalue { i32, i1 } %7, 1 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %ov, label %while.end, label %while.body And this is the same excerpt on LLVM 7.0 while.body: ; preds = %while.body.preheader, %while.body %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, %while.body ] %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %while.body.preheader ] %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %while.body.preheader ] %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 %lsr.iv.next = add nsw i32 %lsr.iv, 1 %tobool = icmp eq i32 %lsr.iv.next, 0 %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 br i1 %tobool, label %while.end, label %while.body LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal code, specially compared with what LLVM 7.0 produces. The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 is this: bb.5: ; predecessors: %bb.1 successors: %bb.3(0x80000000); %bb.3(100.00%) %8:intregs = A2_tfrsi -100 J2_jump %bb.3, implicit-def $pc bb.3.while.body: ; predecessors: %bb.3, %bb.5 successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3(96.88%) %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %14:predregs = C2_cmpgtu %0:intregs, %3:intregs J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc J2_jump %bb.4, implicit-def dead $pc The same code on LLVM 7 is this: bb.2.while.body.preheader: ; predecessors: %bb.1 successors: %bb.4(0x80000000); %bb.4(200.00%) %11:intregs = A2_tfrsi -100 J2_jump %bb.4, implicit-def dead $pc bb.4.while.body: ; predecessors: %bb.2, %bb.4 successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%) %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) %3:intregs = A2_addi %0:intregs, 1 %13:predregs = C2_cmpeqi %3:intregs, 0 J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc J2_jump %bb.5, implicit-def dead $pc The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware Loop as shown in the assembly code earlier in this message. However, LLVM 9 can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction that gets generated. This instruction is a consequence of the introduction of the “llvm.addu” intrinsic that I showed earlier. I am presenting here the case of Hexagon, but I suspect the same sub-optimal code may happen for the PowerPC (not checked yet). All the code excerpts were obtained with -Os flags I’m highly interested in getting in contact with someone familiar with the Hexagon/PowerPC targets regarding this subject. Any pointers to the right persons would be appreciated. My interest comes from the fact that I am proposing an improvement on the LSR pass that affects all targets and I need hardware loops to be properly generated in LLVM 9 like they used to be in LLVM 7. Thanks John. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190702/02fe7532/attachment-0001.html>
Joan Lluch via llvm-dev
2019-Jul-03 07:23 UTC
[llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
Hi Krzysztof, Thanks for that. This makes a lot of sense. John.> On 2 Jul 2019, at 16:40, Krzysztof Parzyszek <kparzysz at quicinc.com> wrote: > > Hi John, > > The approach I took here was to deal with the intrinsics once they’re generated. The hardware loop code tries to evaluate the iteration count, and when it fails to do so, it skips the loop. The complication with unsigned overflow as a loop exit condition is that it makes calculating of the iteration count harder, and the current code for hardware loop doesn’t handle it. It should be possible to expand it, but it would be a non-trivial amount of work. Instead, I decided to rewrite the simple cases of unsigned add/sub, where the overflow condition can be rewritten in a way that is currently recognized. > Varying increments are not translatable into a hardware loop, but increments that are not 1 are, as long as we can calculate the iteration count (the hardware loop instruction takes a register containing the iteration count as an operand). My guess is that if there was a loop with a non-unit increment, whose exit condition was an unsigned overflow, that loop would not have been handled by the hardware loop generation to begin with, so only dealing with +/-1 should be sufficient. In the long term improving the hardware loop generation would be a better approach, but it’s more work. > > > -- > Krzysztof Parzyszek kparzysz at quicinc.com <mailto:kparzysz at quicinc.com> LLVM compiler development > > From: Joan Lluch <joan.lluch at icloud.com <mailto:joan.lluch at icloud.com>> > Sent: Tuesday, July 2, 2019 8:45 AM > To: Krzysztof Parzyszek <kparzysz at quicinc.com <mailto:kparzysz at quicinc.com>> > Cc: llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>; Sanjay Patel <spatel at rotateright.com <mailto:spatel at rotateright.com>> > Subject: [EXT] Re: [llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic. > > Hi Krzysztof, > > Thank you very much for such a quick reaction. This is very much appreciated. > > I have pulled your code from the main LLVM repo and found that it works fine!. However, after looking at it I have a question about it. My understanding is that your code only replaces uaddo, usubo with ‘1’ as the second operand. This certainly solves the hardware loop problem and I assume it’s enough, as there’s no possible loops with scalable increments? > > I also want to mention that based on the reply I received from Sanjay, I also tried to override ‘shouldFormOverflowOp’ to always return false. This also solves the issue, but of course it prevents the ‘.with.overflow’ intrinsics to be generated at all. My understanding is that this fully mimics LLVM 7.0 behaviour, but I do not have the required knowledge on Hexagon to give an opinion about whether this is useful. I’m just posting this as a matter of information > > Thanks again! > > John > > > On 1 Jul 2019, at 17:51, Krzysztof Parzyszek <kparzysz at quicinc.com <mailto:kparzysz at quicinc.com>> wrote: > > The Hexagon part is fixed in r364790. > > -- > Krzysztof Parzyszek kparzysz at quicinc.com <mailto:kparzysz at quicinc.com> LLVM compiler development > > From: llvm-dev <llvm-dev-bounces at lists.llvm.org <mailto:llvm-dev-bounces at lists.llvm.org>> On Behalf Of Joan Lluch via llvm-dev > Sent: Sunday, June 30, 2019 2:04 PM > To: llvm-dev <llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>> > Subject: [EXT] [llvm-dev] [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic. > > Hi All, > > The following code : > > void hexagon2( int *a, int *res ) > { > int i = 100; > while ( i-- ) { > *res++ = *a++; > } > } > > gets compiled as a sub-optimal Software loop on LLVM 9.0 instead of a Hardware loop, whereas it was compiled as a Hardware Loop in LLVM 7.0. > > > This is the final assembly code generated by LLVM 9.0 : > > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt(r0,#399) > } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: > { > r0 = #-100 > } // encoding: [0x80,0xf3,0xdf,0x78] > .LBB0_3: // %while.body > // =>This Inner Loop Header: Depth=1 > { > r3 = add(r0,#1) > r4 = memw(r2++#4) > memw(r1++#4) = r4.new > } // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab] > { > p0 = cmp.gtu(r0,r3); if (!p0.new) jump:t .LBB0_3 > r0 = r3 > } // encoding: [A,0x63'A',0x40'A',0x15'A',0x00,0xc0,0x63,0x70] > // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B9_PCREL > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > > This is the assembly code generated by LLVM 7.0 : > > .text > .file "main.c" > .globl hexagon2 // -- Begin function hexagon2 > .p2align 2 > .type hexagon2, at function > hexagon2: // @hexagon2 > // %bb.0: // %entry.old > { > p0 = cmp.gtu(r0,r1); if (p0.new) jump:nt .LBB0_5 > r2 = r0 > allocframe(#0) > } // encoding: [A,0x41'A',A,0x15'A',0x00,0x3c,0x02,0x70] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B9_PCREL > // %bb.1: // %entry.old > { > r0 = sub(r1,r0) > } // encoding: [0x00,0xc1,0x20,0xf3] > { > if (p0.new) jump:nt .LBB0_5 > p0 = cmp.gt(r0,#399) > } // encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75] > // fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL > // %bb.2: // %while.body.preheader > { > loop0(.LBB0_3,#100) > } // encoding: [0x20'A',0xc0'A',0x03'A',0x69'A'] > // fixup A - offset: 0, value: .LBB0_3, kind: fixup_Hexagon_B7_PCREL > .Ltmp0: // Block address taken > .LBB0_3: // %while.body > // =>This Inner Loop Header: Depth=1 > { > r0 = memw(r2++#4) > memw(r1++#4) = r0.new > } :endloop0 // encoding: [0x20,0x80,0x82,0x9b,0x08,0xd2,0xa1,0xab] > // %bb.4: // %while.end > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .LBB0_5: // %while.body.rtli > { > call memmove > r1:0 = combine(r2,r1) > r2 = #400 > } // encoding: [A,0x40'A',A,0x5a'A',0x00,0x41,0x02,0xf5,0x02,0xf2,0x00,0x78] > // fixup A - offset: 0, value: memmove, kind: fixup_Hexagon_B22_PCREL > { > r31:30 = dealloc_return(r30):raw > } // encoding: [0x1e,0xc0,0x1e,0x96] > .Lfunc_end0: > .size hexagon2, .Lfunc_end0-hexagon2 > // -- End function > > > The code generated by LLVM 7.0 is better than LLVM 9.0 because 9.0 did not made use of Hardware loops. This is in my opinion a bad regression from some earlier version. This is not an isolated case, more cases of the same LLVM 9 ‘defect’ are easy to find. > > I have investigated the issue and I identified the root cause of it, which is related with the initial use of the “llvm.uadd" intrinsic in LLVM 9.0 to increment the loop Induction Variable, instead of an “add” instruction like LLVM 7.0. > > > This is the while.body excerpt after "CodeGen Prepare” in LLVM 9.0 > > while.body: ; preds = %entry.old, %while.body > %lsr.iv = phi i32 [ %math, %while.body ], [ -100, %entry.old ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %entry.old ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %entry.old ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %lsr.iv, i32 1) > %math = extractvalue { i32, i1 } %7, 0 > %ov = extractvalue { i32, i1 } %7, 1 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %ov, label %while.end, label %while.body > > > And this is the same excerpt on LLVM 7.0 > > while.body: ; preds = %while.body.preheader, %while.body > %lsr.iv = phi i32 [ -100, %while.body.preheader ], [ %lsr.iv.next, %while.body ] > %res.addr.04 = phi i32* [ %cgep1, %while.body ], [ %res, %while.body.preheader ] > %a.addr.03 = phi i32* [ %cgep, %while.body ], [ %a, %while.body.preheader ] > %6 = load i32, i32* %a.addr.03, align 4, !tbaa !2 > store i32 %6, i32* %res.addr.04, align 4, !tbaa !2 > %lsr.iv.next = add nsw i32 %lsr.iv, 1 > %tobool = icmp eq i32 %lsr.iv.next, 0 > %cgep = getelementptr inbounds i32, i32* %a.addr.03, i32 1 > %cgep1 = getelementptr inbounds i32, i32* %res.addr.04, i32 1 > br i1 %tobool, label %while.end, label %while.body > > > LLVM 9 uses “llvm.uadd”. This finally prevents the “Hexagon Hardware Loops” pass to recognise a hardware loop pattern, resulting in sub-optimal code, specially compared with what LLVM 7.0 produces. > > The code (excerpt) just before the Hexagon Hardware Loops pass on LLVM 9 is this: > > bb.5: > ; predecessors: %bb.1 > successors: %bb.3(0x80000000); %bb.3(100.00%) > > %8:intregs = A2_tfrsi -100 > J2_jump %bb.3, implicit-def $pc > > > > bb.3.while.body: > ; predecessors: %bb.3, %bb.5 > successors: %bb.4(0x04000000), %bb.3(0x7c000000); %bb.4(3.12%), %bb.3(96.88%) > > %0:intregs = PHI %8:intregs, %bb.5, %3:intregs, %bb.3 > %1:intregs = PHI %7:intregs, %bb.5, %5:intregs, %bb.3 > %2:intregs = PHI %6:intregs, %bb.5, %4:intregs, %bb.3 > %13:intregs, %4:intregs = L2_loadri_pi %2:intregs(tied-def 1), 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs(tied-def 0), 4, %13:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %14:predregs = C2_cmpgtu %0:intregs, %3:intregs > J2_jumpf %14:predregs, %bb.3, implicit-def dead $pc > J2_jump %bb.4, implicit-def dead $pc > > > > > The same code on LLVM 7 is this: > > > > bb.2.while.body.preheader: > ; predecessors: %bb.1 > successors: %bb.4(0x80000000); %bb.4(200.00%) > > %11:intregs = A2_tfrsi -100 > J2_jump %bb.4, implicit-def dead $pc > > > > bb.4.while.body: > ; predecessors: %bb.2, %bb.4 > successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%) > > %0:intregs = PHI %11:intregs, %bb.2, %3:intregs, %bb.4 > %1:intregs = PHI %7:intregs, %bb.2, %5:intregs, %bb.4 > %2:intregs = PHI %6:intregs, %bb.2, %4:intregs, %bb.4 > %12:intregs, %4:intregs = L2_loadri_pi %2:intregs, 4 :: (load 4 from %ir.a.addr.03, !tbaa !2) > %5:intregs = S2_storeri_pi %1:intregs, 4, %12:intregs :: (store 4 into %ir.res.addr.04, !tbaa !2) > %3:intregs = A2_addi %0:intregs, 1 > %13:predregs = C2_cmpeqi %3:intregs, 0 > J2_jumpf %13:predregs, %bb.4, implicit-def dead $pc > J2_jump %bb.5, implicit-def dead $pc > > > The differences above allow LLVM 7 to turn %13, %3, %11 into a hardware Loop as shown in the assembly code earlier in this message. However, LLVM 9 can’t identify a Hardware loop pattern due to the odd C2_cmpgtu instruction that gets generated. This instruction is a consequence of the introduction of the “llvm.addu” intrinsic that I showed earlier. > > I am presenting here the case of Hexagon, but I suspect the same sub-optimal code may happen for the PowerPC (not checked yet). > > All the code excerpts were obtained with -Os flags > > I’m highly interested in getting in contact with someone familiar with the Hexagon/PowerPC targets regarding this subject. Any pointers to the right persons would be appreciated. My interest comes from the fact that I am proposing an improvement on the LSR pass that affects all targets and I need hardware loops to be properly generated in LLVM 9 like they used to be in LLVM 7. > > Thanks > > John.-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190703/1e7663a5/attachment-0001.html>
Reasonably Related Threads
- [hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
- [LLVMdev] Target Dependent Hexagon Packetizer patch
- Tail merging "undef" with a defined register: wrong code
- Tail merging "undef" with a defined register: wrong code
- Tail merging "undef" with a defined register: wrong code