Wei Mi via llvm-dev
2017-Dec-19 00:14 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
Hi,
Recently 10% performance regression on an important benchmark showed up
after we integrated https://reviews.llvm.org/rL318299. The analysis showed
that rL318299 triggered loop rotation on an multi exits loop, and the loop
rotation introduced code layout issue. The performance regression is a
side-effect of rL318299. I got two testcases a.ll and b.ll attached to
illustrate the problem. a.ll was generated by rL318298 and b.ll was
generated by rL318299.
-------------------------- a.ll ----------------------------
declare void @_Z1fv() local_unnamed_addr #2
@i = global i8 0, align 1
define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr #3 {
entry:
br label %while.cond
while.cond: ; preds = %while.body,
%entry
%h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
%d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
%cmp = icmp ugt i8* %h.addr.0, @i
br i1 %cmp, label %while.end, label %while.body
while.body: ; preds = %while.cond
%0 = bitcast i8* %d.addr.0 to i64*
%1 = load i64, i64* %0, align 1
%2 = bitcast i8* %h.addr.0 to i64*
store i64 %1, i64* %2, align 1
%add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
%3 = bitcast i8* %add.ptr to i64*
%4 = load i64, i64* %3, align 1
store i64 %4, i64* %2, align 1
%add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
%add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
%cmp5 = icmp ult i8* %add.ptr4, %p3
br i1 %cmp5, label %while.cond, label %return
while.end: ; preds = %while.cond
tail call void @_Z1fv()
unreachable
return: ; preds = %while.body
ret i8* %p3
}
-------------------------- b.ll ----------------------------
declare void @_Z1fv() local_unnamed_addr #2
@i = global i8 0, align 1
define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr #3 {
entry:
br label %while.cond
while.cond: ; preds = %cleanup.cont,
%entry
%h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ]
%d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ]
%cmp = icmp ugt i8* %h.addr.0, @i
br i1 %cmp, label %while.end, label %while.body
while.body: ; preds = %while.cond
%0 = bitcast i8* %d.addr.0 to i64*
%1 = load i64, i64* %0, align 1
%2 = bitcast i8* %h.addr.0 to i64*
store i64 %1, i64* %2, align 1
%add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
%3 = bitcast i8* %add.ptr to i64*
%4 = load i64, i64* %3, align 1
store i64 %4, i64* %2, align 1
%add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
%cmp5 = icmp ult i8* %add.ptr4, %p3
br i1 %cmp5, label %cleanup.cont, label %return
cleanup.cont: ; preds = %while.body
%add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
br label %while.cond
while.end: ; preds = %while.cond
tail call void @_Z1fv()
unreachable
return: ; preds = %while.body
ret i8* %p3
}
The only difference between a.ll and b.ll is the basicblock cleanup.cont.
-------------------------- a.ll after loop rotate, same as a.ll before loop
rotate ----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
; ModuleID = '<stdin>'
source_filename = "<stdin>"
@i = global i8 0, align 1
declare void @_Z1fv() local_unnamed_addr
define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr {
entry:
br label %while.cond
while.cond: ; preds = %while.body,
%entry
%h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
%d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
%cmp = icmp ugt i8* %h.addr.0, @i
br i1 %cmp, label %while.end, label %while.body
while.body: ; preds = %while.cond
%0 = bitcast i8* %d.addr.0 to i64*
%1 = load i64, i64* %0, align 1
%2 = bitcast i8* %h.addr.0 to i64*
store i64 %1, i64* %2, align 1
%add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
%3 = bitcast i8* %add.ptr to i64*
%4 = load i64, i64* %3, align 1
store i64 %4, i64* %2, align 1
%add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
%add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
%cmp5 = icmp ult i8* %add.ptr4, %p3
br i1 %cmp5, label %while.cond, label %return
while.end: ; preds = %while.cond
tail call void @_Z1fv()
unreachable
return: ; preds = %while.body
ret i8* %p3
}
-------------------------- b.ll after loop rotate
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
@i = global i8 0, align 1
declare void @_Z1fv() local_unnamed_addr
define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr {
entry:
%cmp1 = icmp ugt i8* %h, @i
br i1 %cmp1, label %while.end, label %while.body.lr.ph
while.body.lr.ph: ; preds = %entry
br label %while.body
while.cond: ; preds = %while.body
%h.addr.0 = phi i8* [ %add.ptr4, %while.body ]
%d.addr.0 = phi i8* [ %add.ptr3, %while.body ]
%cmp = icmp ugt i8* %h.addr.0, @i
br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body
while.body: ; preds = %
while.body.lr.ph, %while.cond
%d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, %while.cond ]
%h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, %while.cond ]
%0 = bitcast i8* %d.addr.03 to i64*
%1 = load i64, i64* %0, align 1
%2 = bitcast i8* %h.addr.02 to i64*
store i64 %1, i64* %2, align 1
%add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8
%3 = bitcast i8* %add.ptr to i64*
%4 = load i64, i64* %3, align 1
store i64 %4, i64* %2, align 1
%add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6
%cmp5 = icmp ult i8* %add.ptr4, %p3
%add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6
br i1 %cmp5, label %while.cond, label %return
while.cond.while.end_crit_edge: ; preds = %while.cond
br label %while.end
while.end: ; preds
%while.cond.while.end_crit_edge, %entry
tail call void @_Z1fv()
unreachable
return: ; preds = %while.body
ret i8* %p3
}
a.ll and b.ll have different results after loop rotation because of
http://llvm.org/viewvc/llvm-project?view=revision&revision=181230
<https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg>
-------------------------- a.s generated from a.ll
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
|~/workarea/llvm-r318298/dbuild/bin/llc
.cfi_startproc
# BB#0: # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
.p2align 4, 0x90
.LBB0_1: # %while.cond
# =>This Inner Loop Header: Depth=1
cmpq %rax, %rsi
ja .LBB0_4
# BB#2: # %while.body
# in Loop: Header=BB0_1 Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rdi
addq $6, %rsi
cmpq %rdx, %rsi
jb .LBB0_1
# BB#3: # %return
movq %rdx, %rax
popq %rcx
retq
.LBB0_4: # %while.end
callq _Z1fv
.Lfunc_end0:
.size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
.cfi_endproc
call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s will
contain mostly fall through branches.
-------------------------- b.s generated from b.ll
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
|~/workarea/llvm-r318298/dbuild/bin/llc
.cfi_startproc
# BB#0: # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
cmpq %rax, %rsi
ja .LBB0_5
# BB#1:
movl $i, %eax
.p2align 4, 0x90
.LBB0_3: # %while.body
# =>This Inner Loop Header: Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rsi
cmpq %rdx, %rsi
jae .LBB0_4
# BB#2: # %while.cond
# in Loop: Header=BB0_3 Depth=1
addq $6, %rdi
cmpq %rax, %rsi
jbe .LBB0_3
.LBB0_5: # %while.end
callq _Z1fv
.LBB0_4: # %return
movq %rdx, %rax
popq %rcx
retq
.Lfunc_end0:
.size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
.cfi_endproc
call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not
fall
through when exiting the loop. The non-fall-through branch increases branch
misses significantly and regresses the benchmark by 10%.
Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like
what tail duplication does, but basicblock .LBB0_3 contains 7 instructions
and it will introduce some cost of code size increase.
Any suggestion to fix the issue are welcomed!
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/0fe56494/attachment.html>
Wei Mi via llvm-dev
2017-Dec-19 00:21 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:> Hi, > > Recently 10% performance regression on an important benchmark showed up > after we integrated https://reviews.llvm.org/rL318299. The analysis > showed that rL318299 triggered loop rotation on an multi exits loop, and > the loop rotation introduced code layout issue. The performance regression > is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to > illustrate the problem. a.ll was generated by rL318298 and b.ll was > generated by rL318299. >a.ll and b.ll are generated from the same function by different versions of compiler. a.s and b.s generated from a.ll and b.ll respectively showed where the performance difference comes from. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/94f6cc0d/attachment.html>
Chandler Carruth via llvm-dev
2017-Dec-19 01:01 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
What I can't figure out is why we don't get the same basic block layout for b.ll.... Specifically, I understand that we essentially have an iteration peeled out of the loop by loop rotation. But the result still should get the fancy layout that causes the hot conditional branch (away from the unreachable) to fall through into the loop header... On Mon, Dec 18, 2017 at 4:21 PM Wei Mi <wmi at google.com> wrote:> On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote: > >> Hi, >> >> Recently 10% performance regression on an important benchmark showed up >> after we integrated https://reviews.llvm.org/rL318299. The analysis >> showed that rL318299 triggered loop rotation on an multi exits loop, and >> the loop rotation introduced code layout issue. The performance regression >> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to >> illustrate the problem. a.ll was generated by rL318298 and b.ll was >> generated by rL318299. >> > > a.ll and b.ll are generated from the same function by different versions > of compiler. a.s and b.s generated from a.ll and b.ll respectively showed > where the performance difference comes from. >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/bb40f009/attachment.html>
Xinliang David Li via llvm-dev
2017-Dec-19 01:46 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
The introduction of cleanup.cond block in b.ll without loop-rotation already makes the layout worse than a.ll. Without introducing cleanup.cond block, the layout out is entry->while.cond -> while.body->ret All the arrows are hot fall through edges which is good. With cleanup.cond introduced in b.ll, the layout without tailDup nor loop rotation looks like: entry->while.cond ->while.body->cleanup.cond, ret Note that now there is no fall through edge to 'ret' block and while.body now needs to explicitly branch to 'ret'. If loop rotation happens, we will have entry, cleanup.cond -> while.cond -> while.body->ret Not that there will be a hot branch from entry to while.cond. LLVM actually does both tail dup and loop rotation. And the layout looks like: entry --> while.cond.dup, cleanup.cond->while.cond -> while.body->ret this is better than the previous one, but there is still a hot branch from while.cond.dup to while.body introduced. David On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:> Hi, > > Recently 10% performance regression on an important benchmark showed up > after we integrated https://reviews.llvm.org/rL318299. The analysis > showed that rL318299 triggered loop rotation on an multi exits loop, and > the loop rotation introduced code layout issue. The performance regression > is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to > illustrate the problem. a.ll was generated by rL318298 and b.ll was > generated by rL318299. > > -------------------------- a.ll ---------------------------- > declare void @_Z1fv() local_unnamed_addr #2 > @i = global i8 0, align 1 > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr #3 { > entry: > br label %while.cond > > while.cond: ; preds = %while.body, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %while.cond, label %return > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > > -------------------------- b.ll ---------------------------- > declare void @_Z1fv() local_unnamed_addr #2 > @i = global i8 0, align 1 > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr #3 { > entry: > br label %while.cond > > while.cond: ; preds = %cleanup.cont, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %cleanup.cont, label %return > > cleanup.cont: ; preds = %while.body > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > br label %while.cond > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > The only difference between a.ll and b.ll is the basicblock cleanup.cont. > > -------------------------- a.ll after loop rotate, same as a.ll before > loop rotate ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll > > ; ModuleID = '<stdin>' > source_filename = "<stdin>" > > @i = global i8 0, align 1 > > declare void @_Z1fv() local_unnamed_addr > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr { > entry: > br label %while.cond > > while.cond: ; preds = %while.body, > %entry > %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.end, label %while.body > > while.body: ; preds = %while.cond > %0 = bitcast i8* %d.addr.0 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.0 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > br i1 %cmp5, label %while.cond, label %return > > while.end: ; preds = %while.cond > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > -------------------------- b.ll after loop rotate > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll > > @i = global i8 0, align 1 > declare void @_Z1fv() local_unnamed_addr > > define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone > returned %p3) local_unnamed_addr { > entry: > %cmp1 = icmp ugt i8* %h, @i > br i1 %cmp1, label %while.end, label %while.body.lr.ph > > while.body.lr.ph: ; preds = %entry > br label %while.body > > while.cond: ; preds = %while.body > %h.addr.0 = phi i8* [ %add.ptr4, %while.body ] > %d.addr.0 = phi i8* [ %add.ptr3, %while.body ] > %cmp = icmp ugt i8* %h.addr.0, @i > br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body > > while.body: ; preds = % > while.body.lr.ph, %while.cond > %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, > %while.cond ] > %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, > %while.cond ] > %0 = bitcast i8* %d.addr.03 to i64* > %1 = load i64, i64* %0, align 1 > %2 = bitcast i8* %h.addr.02 to i64* > store i64 %1, i64* %2, align 1 > %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8 > %3 = bitcast i8* %add.ptr to i64* > %4 = load i64, i64* %3, align 1 > store i64 %4, i64* %2, align 1 > %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6 > %cmp5 = icmp ult i8* %add.ptr4, %p3 > %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6 > br i1 %cmp5, label %while.cond, label %return > > while.cond.while.end_crit_edge: ; preds = %while.cond > br label %while.end > > while.end: ; preds > %while.cond.while.end_crit_edge, %entry > tail call void @_Z1fv() > unreachable > > return: ; preds = %while.body > ret i8* %p3 > } > > a.ll and b.ll have different results after loop rotation because of > http://llvm.org/viewvc/llvm-project?view=revision&revision=181230 > <https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg> > > -------------------------- a.s generated from a.ll > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll > |~/workarea/llvm-r318298/dbuild/bin/llc > > .cfi_startproc > # BB#0: # %entry > pushq %rax > .cfi_def_cfa_offset 16 > movl $i, %eax > .p2align 4, 0x90 > .LBB0_1: # %while.cond > # =>This Inner Loop Header: Depth=1 > cmpq %rax, %rsi > ja .LBB0_4 > # BB#2: # %while.body > # in Loop: Header=BB0_1 Depth=1 > movq (%rdi), %rcx > movq %rcx, (%rsi) > movq 8(%rdi), %rcx > movq %rcx, (%rsi) > addq $6, %rdi > addq $6, %rsi > cmpq %rdx, %rsi > jb .LBB0_1 > # BB#3: # %return > movq %rdx, %rax > popq %rcx > retq > .LBB0_4: # %while.end > callq _Z1fv > .Lfunc_end0: > .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ > .cfi_endproc > > call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s > will contain mostly fall through branches. > > -------------------------- b.s generated from b.ll > ---------------------------- > ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll > |~/workarea/llvm-r318298/dbuild/bin/llc > > .cfi_startproc > # BB#0: # %entry > pushq %rax > .cfi_def_cfa_offset 16 > movl $i, %eax > cmpq %rax, %rsi > ja .LBB0_5 > # BB#1: > movl $i, %eax > .p2align 4, 0x90 > .LBB0_3: # %while.body > # =>This Inner Loop Header: Depth=1 > movq (%rdi), %rcx > movq %rcx, (%rsi) > movq 8(%rdi), %rcx > movq %rcx, (%rsi) > addq $6, %rsi > cmpq %rdx, %rsi > jae .LBB0_4 > # BB#2: # %while.cond > # in Loop: Header=BB0_3 Depth=1 > addq $6, %rdi > cmpq %rax, %rsi > jbe .LBB0_3 > .LBB0_5: # %while.end > callq _Z1fv > .LBB0_4: # %return > movq %rdx, %rax > popq %rcx > retq > .Lfunc_end0: > .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ > .cfi_endproc > > call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not fall > through when exiting the loop. The non-fall-through branch increases branch > misses significantly and regresses the benchmark by 10%. > > Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like > what tail duplication does, but basicblock .LBB0_3 contains 7 instructions > and it will introduce some cost of code size increase. > > Any suggestion to fix the issue are welcomed! >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/63053299/attachment-0001.html>
Chandler Carruth via llvm-dev
2017-Dec-19 02:03 UTC
[llvm-dev] A code layout related side-effect introduced by rL318299
On Mon, Dec 18, 2017 at 5:46 PM Xinliang David Li <davidxl at google.com> wrote:> The introduction of cleanup.cond block in b.ll without loop-rotation > already makes the layout worse than a.ll. > > > Without introducing cleanup.cond block, the layout out is > > entry->while.cond -> while.body->ret > > All the arrows are hot fall through edges which is good. > > With cleanup.cond introduced in b.ll, the layout without tailDup nor loop > rotation looks like: > > > entry->while.cond ->while.body->cleanup.cond, ret > > Note that now there is no fall through edge to 'ret' block and while.body > now needs to explicitly branch to 'ret'. If loop rotation happens, we > will have > > entry, cleanup.cond -> while.cond -> while.body->ret > > Not that there will be a hot branch from entry to while.cond. > > LLVM actually does both tail dup and loop rotation. And the layout looks > like: > > entry --> while.cond.dup, cleanup.cond->while.cond -> while.body->ret > > this is better than the previous one, but there is still a hot branch from > while.cond.dup to while.body introduced. >(just relaying a comment I made in person) This does seem like a less good layout, but as a consequence of it we do avoid one addition and comparison along the hot path. It's not obvious to me which is better: the better layout (with added instruction) or the minimal set of instructions with less good layout.> > David > On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote: > >> Hi, >> >> Recently 10% performance regression on an important benchmark showed up >> after we integrated https://reviews.llvm.org/rL318299. The analysis >> showed that rL318299 triggered loop rotation on an multi exits loop, and >> the loop rotation introduced code layout issue. The performance regression >> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to >> illustrate the problem. a.ll was generated by rL318298 and b.ll was >> generated by rL318299. >> >> -------------------------- a.ll ---------------------------- >> declare void @_Z1fv() local_unnamed_addr #2 >> @i = global i8 0, align 1 >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr #3 { >> entry: >> br label %while.cond >> >> while.cond: ; preds = %while.body, >> %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %while.cond, label %return >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> >> -------------------------- b.ll ---------------------------- >> declare void @_Z1fv() local_unnamed_addr #2 >> @i = global i8 0, align 1 >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr #3 { >> entry: >> br label %while.cond >> >> while.cond: ; preds >> %cleanup.cont, %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %cleanup.cont, label %return >> >> cleanup.cont: ; preds = %while.body >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> br label %while.cond >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> The only difference between a.ll and b.ll is the basicblock cleanup.cont. >> >> -------------------------- a.ll after loop rotate, same as a.ll before >> loop rotate ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll >> >> ; ModuleID = '<stdin>' >> source_filename = "<stdin>" >> >> @i = global i8 0, align 1 >> >> declare void @_Z1fv() local_unnamed_addr >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr { >> entry: >> br label %while.cond >> >> while.cond: ; preds = %while.body, >> %entry >> %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.end, label %while.body >> >> while.body: ; preds = %while.cond >> %0 = bitcast i8* %d.addr.0 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.0 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> br i1 %cmp5, label %while.cond, label %return >> >> while.end: ; preds = %while.cond >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> -------------------------- b.ll after loop rotate >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll >> >> @i = global i8 0, align 1 >> declare void @_Z1fv() local_unnamed_addr >> >> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone >> returned %p3) local_unnamed_addr { >> entry: >> %cmp1 = icmp ugt i8* %h, @i >> br i1 %cmp1, label %while.end, label %while.body.lr.ph >> >> while.body.lr.ph: ; preds = %entry >> br label %while.body >> >> while.cond: ; preds = %while.body >> %h.addr.0 = phi i8* [ %add.ptr4, %while.body ] >> %d.addr.0 = phi i8* [ %add.ptr3, %while.body ] >> %cmp = icmp ugt i8* %h.addr.0, @i >> br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body >> >> while.body: ; preds = % >> while.body.lr.ph, %while.cond >> %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, >> %while.cond ] >> %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, >> %while.cond ] >> %0 = bitcast i8* %d.addr.03 to i64* >> %1 = load i64, i64* %0, align 1 >> %2 = bitcast i8* %h.addr.02 to i64* >> store i64 %1, i64* %2, align 1 >> %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8 >> %3 = bitcast i8* %add.ptr to i64* >> %4 = load i64, i64* %3, align 1 >> store i64 %4, i64* %2, align 1 >> %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6 >> %cmp5 = icmp ult i8* %add.ptr4, %p3 >> %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6 >> br i1 %cmp5, label %while.cond, label %return >> >> while.cond.while.end_crit_edge: ; preds = %while.cond >> br label %while.end >> >> while.end: ; preds >> %while.cond.while.end_crit_edge, %entry >> tail call void @_Z1fv() >> unreachable >> >> return: ; preds = %while.body >> ret i8* %p3 >> } >> >> a.ll and b.ll have different results after loop rotation because of >> http://llvm.org/viewvc/llvm-project?view=revision&revision=181230 >> <https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg> >> >> -------------------------- a.s generated from a.ll >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll >> |~/workarea/llvm-r318298/dbuild/bin/llc >> >> .cfi_startproc >> # BB#0: # %entry >> pushq %rax >> .cfi_def_cfa_offset 16 >> movl $i, %eax >> .p2align 4, 0x90 >> .LBB0_1: # %while.cond >> # =>This Inner Loop Header: >> Depth=1 >> cmpq %rax, %rsi >> ja .LBB0_4 >> # BB#2: # %while.body >> # in Loop: Header=BB0_1 Depth=1 >> movq (%rdi), %rcx >> movq %rcx, (%rsi) >> movq 8(%rdi), %rcx >> movq %rcx, (%rsi) >> addq $6, %rdi >> addq $6, %rsi >> cmpq %rdx, %rsi >> jb .LBB0_1 >> # BB#3: # %return >> movq %rdx, %rax >> popq %rcx >> retq >> .LBB0_4: # %while.end >> callq _Z1fv >> .Lfunc_end0: >> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ >> .cfi_endproc >> >> call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s >> will contain mostly fall through branches. >> >> -------------------------- b.s generated from b.ll >> ---------------------------- >> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll >> |~/workarea/llvm-r318298/dbuild/bin/llc >> >> .cfi_startproc >> # BB#0: # %entry >> pushq %rax >> .cfi_def_cfa_offset 16 >> movl $i, %eax >> cmpq %rax, %rsi >> ja .LBB0_5 >> # BB#1: >> movl $i, %eax >> .p2align 4, 0x90 >> .LBB0_3: # %while.body >> # =>This Inner Loop Header: >> Depth=1 >> movq (%rdi), %rcx >> movq %rcx, (%rsi) >> movq 8(%rdi), %rcx >> movq %rcx, (%rsi) >> addq $6, %rsi >> cmpq %rdx, %rsi >> jae .LBB0_4 >> # BB#2: # %while.cond >> # in Loop: Header=BB0_3 Depth=1 >> addq $6, %rdi >> cmpq %rax, %rsi >> jbe .LBB0_3 >> .LBB0_5: # %while.end >> callq _Z1fv >> .LBB0_4: # %return >> movq %rdx, %rax >> popq %rcx >> retq >> .Lfunc_end0: >> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_ >> .cfi_endproc >> >> call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not >> fall through when exiting the loop. The non-fall-through branch increases >> branch misses significantly and regresses the benchmark by 10%. >> >> Now a possible way to fix it is to duplicate basicblock .LBB0_3, just >> like what tail duplication does, but basicblock .LBB0_3 contains 7 >> instructions and it will introduce some cost of code size increase. >> >> Any suggestion to fix the issue are welcomed! >> >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/83d74bb9/attachment.html>