Wei Mi via llvm-dev
2017-Dec-19 00:14 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
Hi, Recently 10% performance regression on an important benchmark showed up after we integrated https://reviews.llvm.org/rL318299. The analysis showed that rL318299 triggered loop rotation on an multi exits loop, and the loop rotation introduced code layout issue. The performance regression is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to illustrate the problem. a.ll was generated by rL318298 and b.ll was generated by rL318299. -------------------------- a.ll ---------------------------- declare void @_Z1fv() local_unnamed_addr #2 @i = global i8 0, align 1 define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone returned %p3) local_unnamed_addr #3 { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] %cmp = icmp ugt i8* %h.addr.0, @i br i1 %cmp, label %while.end, label %while.body while.body: ; preds = %while.cond %0 = bitcast i8* %d.addr.0 to i64* %1 = load i64, i64* %0, align 1 %2 = bitcast i8* %h.addr.0 to i64* store i64 %1, i64* %2, align 1 %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 %3 = bitcast i8* %add.ptr to i64* %4 = load i64, i64* %3, align 1 store i64 %4, i64* %2, align 1 %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 %cmp5 = icmp ult i8* %add.ptr4, %p3 br i1 %cmp5, label %while.cond, label %return while.end: ; preds = %while.cond tail call void @_Z1fv() unreachable return: ; preds = %while.body ret i8* %p3 } -------------------------- b.ll ---------------------------- declare void @_Z1fv() local_unnamed_addr #2 @i = global i8 0, align 1 define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone returned %p3) local_unnamed_addr #3 { entry: br label %while.cond while.cond: ; preds = %cleanup.cont, %entry %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ] %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ] %cmp = icmp ugt i8* %h.addr.0, @i br i1 %cmp, label %while.end, label %while.body while.body: ; preds = %while.cond %0 = bitcast i8* %d.addr.0 to i64* %1 = load i64, i64* %0, align 1 %2 = bitcast i8* %h.addr.0 to i64* store i64 %1, i64* %2, align 1 %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 %3 = bitcast i8* %add.ptr to i64* %4 = load i64, i64* %3, align 1 store i64 %4, i64* %2, align 1 %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 %cmp5 = icmp ult i8* %add.ptr4, %p3 br i1 %cmp5, label %cleanup.cont, label %return cleanup.cont: ; preds = %while.body %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 br label %while.cond while.end: ; preds = %while.cond tail call void @_Z1fv() unreachable return: ; preds = %while.body ret i8* %p3 } The only difference between a.ll and b.ll is the basicblock cleanup.cont. -------------------------- a.ll after loop rotate, same as a.ll before loop rotate ---------------------------- ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll ; ModuleID = '<stdin>' source_filename = "<stdin>" @i = global i8 0, align 1 declare void @_Z1fv() local_unnamed_addr define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone returned %p3) local_unnamed_addr { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] %cmp = icmp ugt i8* %h.addr.0, @i br i1 %cmp, label %while.end, label %while.body while.body: ; preds = %while.cond %0 = bitcast i8* %d.addr.0 to i64* %1 = load i64, i64* %0, align 1 %2 = bitcast i8* %h.addr.0 to i64* store i64 %1, i64* %2, align 1 %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 %3 = bitcast i8* %add.ptr to i64* %4 = load i64, i64* %3, align 1 store i64 %4, i64* %2, align 1 %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 %cmp5 = icmp ult i8* %add.ptr4, %p3 br i1 %cmp5, label %while.cond, label %return while.end: ; preds = %while.cond tail call void @_Z1fv() unreachable return: ; preds = %while.body ret i8* %p3 } -------------------------- b.ll after loop rotate ---------------------------- ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll @i = global i8 0, align 1 declare void @_Z1fv() local_unnamed_addr define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone returned %p3) local_unnamed_addr { entry: %cmp1 = icmp ugt i8* %h, @i br i1 %cmp1, label %while.end, label %while.body.lr.ph while.body.lr.ph: ; preds = %entry br label %while.body while.cond: ; preds = %while.body %h.addr.0 = phi i8* [ %add.ptr4, %while.body ] %d.addr.0 = phi i8* [ %add.ptr3, %while.body ] %cmp = icmp ugt i8* %h.addr.0, @i br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body while.body: ; preds = % while.body.lr.ph, %while.cond %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, %while.cond ] %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, %while.cond ] %0 = bitcast i8* %d.addr.03 to i64* %1 = load i64, i64* %0, align 1 %2 = bitcast i8* %h.addr.02 to i64* store i64 %1, i64* %2, align 1 %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8 %3 = bitcast i8* %add.ptr to i64* %4 = load i64, i64* %3, align 1 store i64 %4, i64* %2, align 1 %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6 %cmp5 = icmp ult i8* %add.ptr4, %p3 %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6 br i1 %cmp5, label %while.cond, label %return while.cond.while.end_crit_edge: ; preds = %while.cond br label %while.end while.end: ; preds %while.cond.while.end_crit_edge, %entry tail call void @_Z1fv() unreachable return: ; preds = %while.body ret i8* %p3 } a.ll and b.ll have different results after loop rotation because of http://llvm.org/viewvc/llvm-project?view=revision&revision=181230 <https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg> -------------------------- a.s generated from a.ll ---------------------------- ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll |~/workarea/llvm-r318298/dbuild/bin/llc .cfi_startproc # BB#0: # %entry pushq %rax .cfi_def_cfa_offset 16 movl $i, %eax .p2align 4, 0x90 .LBB0_1: # %while.cond # =>This Inner Loop Header: Depth=1 cmpq %rax, %rsi ja .LBB0_4 # BB#2: # %while.body # in Loop: Header=BB0_1 Depth=1 movq (%rdi), %rcx movq %rcx, (%rsi) movq 8(%rdi), %rcx movq %rcx, (%rsi) addq $6, %rdi addq $6, %rsi cmpq %rdx, %rsi jb .LBB0_1 # BB#3: # %return movq %rdx, %rax popq %rcx retq .LBB0_4: # %while.end callq _Z1fv .Lfunc_end0: .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ .cfi_endproc call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s will contain mostly fall through branches. -------------------------- b.s generated from b.ll ---------------------------- ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll |~/workarea/llvm-r318298/dbuild/bin/llc .cfi_startproc # BB#0: # %entry pushq %rax .cfi_def_cfa_offset 16 movl $i, %eax cmpq %rax, %rsi ja .LBB0_5 # BB#1: movl $i, %eax .p2align 4, 0x90 .LBB0_3: # %while.body # =>This Inner Loop Header: Depth=1 movq (%rdi), %rcx movq %rcx, (%rsi) movq 8(%rdi), %rcx movq %rcx, (%rsi) addq $6, %rsi cmpq %rdx, %rsi jae .LBB0_4 # BB#2: # %while.cond # in Loop: Header=BB0_3 Depth=1 addq $6, %rdi cmpq %rax, %rsi jbe .LBB0_3 .LBB0_5: # %while.end callq _Z1fv .LBB0_4: # %return movq %rdx, %rax popq %rcx retq .Lfunc_end0: .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ .cfi_endproc call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not fall through when exiting the loop. The non-fall-through branch increases branch misses significantly and regresses the benchmark by 10%. Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like what tail duplication does, but basicblock .LBB0_3 contains 7 instructions and it will introduce some cost of code size increase. Any suggestion to fix the issue are welcomed! -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/0fe56494/attachment.html>
Wei Mi via llvm-dev
2017-Dec-19 00:21 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:> Hi, > > Recently 10% performance regression on an important benchmark showed up > after we integrated https://reviews.llvm.org/rL318299. The analysis > showed that rL318299 triggered loop rotation on an multi exits loop, and > the loop rotation introduced code layout issue. The performance regression > is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to > illustrate the problem. a.ll was generated by rL318298 and b.ll was > generated by rL318299. >a.ll and b.ll are generated from the same function by different versions of compiler. a.s and b.s generated from a.ll and b.ll respectively showed where the performance difference comes from. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/94f6cc0d/attachment.html>
Chandler Carruth via llvm-dev
2017-Dec-19 01:01 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
What I can't figure out is why we don't get the same basic block layout for b.ll.... Specifically, I understand that we essentially have an iteration peeled out of the loop by loop rotation. But the result still should get the fancy layout that causes the hot conditional branch (away from the unreachable) to fall through into the loop header... On Mon, Dec 18, 2017 at 4:21 PM Wei Mi <wmi at google.com> wrote:> On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote: > >> Hi, >> >> Recently 10% performance regression on an important benchmark showed up >> after we integrated https://reviews.llvm.org/rL318299. The analysis >> showed that rL318299 triggered loop rotation on an multi exits loop, and >> the loop rotation introduced code layout issue. The performance regression >> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to >> illustrate the problem. a.ll was generated by rL318298 and b.ll was >> generated by rL318299. >> > > a.ll and b.ll are generated from the same function by different versions > of compiler. a.s and b.s generated from a.ll and b.ll respectively showed > where the performance difference comes from. >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/bb40f009/attachment.html>
Xinliang David Li via llvm-dev
2017-Dec-19 01:46 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
The introduction of cleanup.cond block in b.ll without loop-rotation already makes the layout worse than a.ll. Without introducing cleanup.cond block, the layout out is entry->while.cond -> while.body->ret All the arrows are hot fall through edges which is good. With cleanup.cond introduced in b.ll, the layout without tailDup nor loop rotation looks like: entry->while.cond ->while.body->cleanup.cond, ret Note that now there is no fall through edge to 'ret' block and while.body now needs to explicitly branch to 'ret'. If loop rotation happens, we will have entry, cleanup.cond -> while.cond -> while.body->ret Not that there will be a hot branch from entry to while.cond. LLVM actually does both tail dup and loop rotation. And the layout looks like: entry --> while.cond.dup, cleanup.cond->while.cond -> while.body->ret this is better than the previous one, but there is still a hot branch from while.cond.dup to while.body introduced. David On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:> Hi, > > Recently 10% performance regression on an important benchmark showed up > after we integrated https://reviews.llvm.org/rL318299. The analysis > showed that rL318299 triggered loop rotation on an multi exits loop, and > the loop rotation introduced code layout issue. The performance regression > is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to > illustrate the problem. a.ll was generated by rL318298 and b.ll was > generated by rL318299. > > -------------------------- a.ll ---------------------------- > declare void @_Z1fv() local_unnamed_addr #2 > @i = global i8 0, align 1 > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr #3 { > entry: > br label %while.cond > > while.cond: ; preds = %while.body, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %while.cond, label %return > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > > -------------------------- b.ll ---------------------------- > declare void @_Z1fv() local_unnamed_addr #2 > @i = global i8 0, align 1 > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr #3 { > entry: > br label %while.cond > > while.cond: ; preds = %cleanup.cont, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %cleanup.cont, label %return > > cleanup.cont: ; preds = %while.body > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > br label %while.cond > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > The only difference between a.ll and b.ll is the basicblock cleanup.cont. > > -------------------------- a.ll after loop rotate, same as a.ll before > loop rotate ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll > > ; ModuleID = '<stdin>' > source_filename = "<stdin>" > > @i = global i8 0, align 1 > > declare void @_Z1fv() local_unnamed_addr > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr { > entry: > br label %while.cond > > while.cond: ; preds = %while.body, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %while.cond, label %return > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > -------------------------- b.ll after loop rotate > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll > > @i = global i8 0, align 1 > declare void @_Z1fv() local_unnamed_addr > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr { > entry: > %cmp1 = icmp ugt i8* %h, @i > br i1 %cmp1, label %while.end, label %while.body.lr.ph > > while.body.lr.ph: ; preds = %entry > br label %while.body > > while.cond: ; preds = %while.body > %h.addr.0 = phi i8* [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body > > while.body: ; preds = % > while.body.lr.ph, %while.cond > %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, > %while.cond ] > %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, > %while.cond ] > %0 = bitcast i8* %d.addr.03 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.02 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6 > br i1 %cmp5, label %while.cond, label %return > > while.cond.while.end_crit_edge: ; preds = %while.cond > br label %while.end > > while.end: ; preds > %while.cond.while.end_crit_edge, %entry > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > a.ll and b.ll have different results after loop rotation because of > http://llvm.org/viewvc/llvm-project?view=revision&revision=181230 > <https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg> > > -------------------------- a.s generated from a.ll > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll > |~/workarea/llvm-r318298/dbuild/bin/llc > > .cfi_startproc > # BB#0: # %entry > pushq %rax > .cfi_def_cfa_offset 16 > movl $i, %eax > .p2align 4, 0x90 > .LBB0_1: # %while.cond > # =>This Inner Loop Header: Depth=1 > cmpq %rax, %rsi > ja .LBB0_4 > # BB#2: # %while.body > # in Loop: Header=BB0_1 Depth=1 > movq (%rdi), %rcx > movq %rcx, (%rsi) > movq 8(%rdi), %rcx > movq %rcx, (%rsi) > addq $6, %rdi > addq $6, %rsi > cmpq %rdx, %rsi > jb .LBB0_1 > # BB#3: # %return > movq %rdx, %rax > popq %rcx > retq > .LBB0_4: # %while.end > callq _Z1fv > .Lfunc_end0: > .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ > .cfi_endproc > > call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s > will contain mostly fall through branches. > > -------------------------- b.s generated from b.ll > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll > |~/workarea/llvm-r318298/dbuild/bin/llc > > .cfi_startproc > # BB#0: # %entry > pushq %rax > .cfi_def_cfa_offset 16 > movl $i, %eax > cmpq %rax, %rsi > ja .LBB0_5 > # BB#1: > movl $i, %eax > .p2align 4, 0x90 > .LBB0_3: # %while.body > # =>This Inner Loop Header: Depth=1 > movq (%rdi), %rcx > movq %rcx, (%rsi) > movq 8(%rdi), %rcx > movq %rcx, (%rsi) > addq $6, %rsi > cmpq %rdx, %rsi > jae .LBB0_4 > # BB#2: # %while.cond > # in Loop: Header=BB0_3 Depth=1 > addq $6, %rdi > cmpq %rax, %rsi > jbe .LBB0_3 > .LBB0_5: # %while.end > callq _Z1fv > .LBB0_4: # %return > movq %rdx, %rax > popq %rcx > retq > .Lfunc_end0: > .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ > .cfi_endproc > > call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not fall > through when exiting the loop. The non-fall-through branch increases branch > misses significantly and regresses the benchmark by 10%. > > Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like > what tail duplication does, but basicblock .LBB0_3 contains 7 instructions > and it will introduce some cost of code size increase. > > Any suggestion to fix the issue are welcomed! >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/63053299/attachment-0001.html>
Chandler Carruth via llvm-dev
2017-Dec-19 02:03 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
On Mon, Dec 18, 2017 at 5:46 PM Xinliang David Li <davidxl at google.com> wrote:> The introduction of cleanup.cond block in b.ll without loop-rotation > already makes the layout worse than a.ll. > > > Without introducing cleanup.cond block, the layout out is > > entry->while.cond -> while.body->ret > > All the arrows are hot fall through edges which is good. > > With cleanup.cond introduced in b.ll, the layout without tailDup nor loop > rotation looks like: > > > entry->while.cond ->while.body->cleanup.cond, ret > > Note that now there is no fall through edge to 'ret' block and while.body > now needs to explicitly branch to 'ret'. If loop rotation happens, we > will have > > entry, cleanup.cond -> while.cond -> while.body->ret > > Not that there will be a hot branch from entry to while.cond. > > LLVM actually does both tail dup and loop rotation. And the layout looks > like: > > entry --> while.cond.dup, cleanup.cond->while.cond -> while.body->ret > > this is better than the previous one, but there is still a hot branch from > while.cond.dup to while.body introduced. >(just relaying a comment I made in person) This does seem like a less good layout, but as a consequence of it we do avoid one addition and comparison along the hot path. It's not obvious to me which is better: the better layout (with added instruction) or the minimal set of instructions with less good layout.> > David > On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote: > >> Hi, >> >> Recently 10% performance regression on an important benchmark showed up >> after we integrated https://reviews.llvm.org/rL318299. The analysis >> showed that rL318299 triggered loop rotation on an multi exits loop, and >> the loop rotation introduced code layout issue. The performance regression >> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to >> illustrate the problem. a.ll was generated by rL318298 and b.ll was >> generated by rL318299. >> >> -------------------------- a.ll ---------------------------- >> declare void @_Z1fv() local_unnamed_addr #2 >> @i = global i8 0, align 1 >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr #3 { >> entry: >> br label %while.cond >> >> while.cond: ; preds = %while.body, >> %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %while.cond, label %return >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> >> -------------------------- b.ll ---------------------------- >> declare void @_Z1fv() local_unnamed_addr #2 >> @i = global i8 0, align 1 >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr #3 { >> entry: >> br label %while.cond >> >> while.cond: ; preds >> %cleanup.cont, %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %cleanup.cont, label %return >> >> cleanup.cont: ; preds = %while.body >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> br label %while.cond >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> The only difference between a.ll and b.ll is the basicblock cleanup.cont. >> >> -------------------------- a.ll after loop rotate, same as a.ll before >> loop rotate ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll >> >> ; ModuleID = '<stdin>' >> source_filename = "<stdin>" >> >> @i = global i8 0, align 1 >> >> declare void @_Z1fv() local_unnamed_addr >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr { >> entry: >> br label %while.cond >> >> while.cond: ; preds = %while.body, >> %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %while.cond, label %return >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> -------------------------- b.ll after loop rotate >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll >> >> @i = global i8 0, align 1 >> declare void @_Z1fv() local_unnamed_addr >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr { >> entry: >> %cmp1 = icmp ugt i8* %h, @i >> br i1 %cmp1, label %while.end, label %while.body.lr.ph >> >> while.body.lr.ph: ; preds = %entry >> br label %while.body >> >> while.cond: ; preds = %while.body >> %h.addr.0 = phi i8* [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body >> >> while.body: ; preds = % >> while.body.lr.ph, %while.cond >> %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, >> %while.cond ] >> %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, >> %while.cond ] >> %0 = bitcast i8* %d.addr.03 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.02 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6 >> br i1 %cmp5, label %while.cond, label %return >> >> while.cond.while.end_crit_edge: ; preds = %while.cond >> br label %while.end >> >> while.end: ; preds >> %while.cond.while.end_crit_edge, %entry >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> a.ll and b.ll have different results after loop rotation because of >> http://llvm.org/viewvc/llvm-project?view=revision&revision=181230 >> <https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg> >> >> -------------------------- a.s generated from a.ll >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll >> |~/workarea/llvm-r318298/dbuild/bin/llc >> >> .cfi_startproc >> # BB#0: # %entry >> pushq %rax >> .cfi_def_cfa_offset 16 >> movl $i, %eax >> .p2align 4, 0x90 >> .LBB0_1: # %while.cond >> # =>This Inner Loop Header: >> Depth=1 >> cmpq %rax, %rsi >> ja .LBB0_4 >> # BB#2: # %while.body >> # in Loop: Header=BB0_1 Depth=1 >> movq (%rdi), %rcx >> movq %rcx, (%rsi) >> movq 8(%rdi), %rcx >> movq %rcx, (%rsi) >> addq $6, %rdi >> addq $6, %rsi >> cmpq %rdx, %rsi >> jb .LBB0_1 >> # BB#3: # %return >> movq %rdx, %rax >> popq %rcx >> retq >> .LBB0_4: # %while.end >> callq _Z1fv >> .Lfunc_end0: >> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ >> .cfi_endproc >> >> call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s >> will contain mostly fall through branches. >> >> -------------------------- b.s generated from b.ll >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll >> |~/workarea/llvm-r318298/dbuild/bin/llc >> >> .cfi_startproc >> # BB#0: # %entry >> pushq %rax >> .cfi_def_cfa_offset 16 >> movl $i, %eax >> cmpq %rax, %rsi >> ja .LBB0_5 >> # BB#1: >> movl $i, %eax >> .p2align 4, 0x90 >> .LBB0_3: # %while.body >> # =>This Inner Loop Header: >> Depth=1 >> movq (%rdi), %rcx >> movq %rcx, (%rsi) >> movq 8(%rdi), %rcx >> movq %rcx, (%rsi) >> addq $6, %rsi >> cmpq %rdx, %rsi >> jae .LBB0_4 >> # BB#2: # %while.cond >> # in Loop: Header=BB0_3 Depth=1 >> addq $6, %rdi >> cmpq %rax, %rsi >> jbe .LBB0_3 >> .LBB0_5: # %while.end >> callq _Z1fv >> .LBB0_4: # %return >> movq %rdx, %rax >> popq %rcx >> retq >> .Lfunc_end0: >> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ >> .cfi_endproc >> >> call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not >> fall through when exiting the loop. The non-fall-through branch increases >> branch misses significantly and regresses the benchmark by 10%. >> >> Now a possible way to fix it is to duplicate basicblock .LBB0_3, just >> like what tail duplication does, but basicblock .LBB0_3 contains 7 >> instructions and it will introduce some cost of code size increase. >> >> Any suggestion to fix the issue are welcomed! >> >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/83d74bb9/attachment.html>