thr3ads.net - llvm dev - [llvm-dev] A code layout related side-effect introduced by rL318299 [Dec 2017]

If this information is useful, please help other people find it:
Share via:

Wei Mi via llvm-dev

2017-Dec-19 00:14 UTC

[llvm-dev] A code layout related side-effect introduced by rL318299

Hi,

Recently a 10% performance regression on an important benchmark showed up
after we integrated https://reviews.llvm.org/rL318299. The analysis showed
that rL318299 triggered loop rotation on a multi-exit loop, and the loop
rotation introduced a code layout issue. The performance regression is a
side effect of rL318299. I have attached two testcases, a.ll and b.ll, to
illustrate the problem. a.ll was generated by rL318298 and b.ll was
generated by rL318299.

-------------------------- a.ll ----------------------------
declare void @_Z1fv() local_unnamed_addr #2
@i = global i8 0, align 1

define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr #3 {
entry:
  br label %while.cond

while.cond:                                       ; preds = %while.body,
%entry
  %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
  %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
  %cmp = icmp ugt i8* %h.addr.0, @i
  br i1 %cmp, label %while.end, label %while.body

while.body:                                       ; preds = %while.cond
  %0 = bitcast i8* %d.addr.0 to i64*
  %1 = load i64, i64* %0, align 1
  %2 = bitcast i8* %h.addr.0 to i64*
  store i64 %1, i64* %2, align 1
  %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
  %3 = bitcast i8* %add.ptr to i64*
  %4 = load i64, i64* %3, align 1
  store i64 %4, i64* %2, align 1
  %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
  %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
  %cmp5 = icmp ult i8* %add.ptr4, %p3
  br i1 %cmp5, label %while.cond, label %return

while.end:                                        ; preds = %while.cond
  tail call void @_Z1fv()
  unreachable

return:                                           ; preds = %while.body
  ret i8* %p3
}


-------------------------- b.ll ----------------------------
declare void @_Z1fv() local_unnamed_addr #2
@i = global i8 0, align 1

define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr #3 {
entry:
  br label %while.cond

while.cond:                                       ; preds = %cleanup.cont,
%entry
  %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ]
  %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ]
  %cmp = icmp ugt i8* %h.addr.0, @i
  br i1 %cmp, label %while.end, label %while.body

while.body:                                       ; preds = %while.cond
  %0 = bitcast i8* %d.addr.0 to i64*
  %1 = load i64, i64* %0, align 1
  %2 = bitcast i8* %h.addr.0 to i64*
  store i64 %1, i64* %2, align 1
  %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
  %3 = bitcast i8* %add.ptr to i64*
  %4 = load i64, i64* %3, align 1
  store i64 %4, i64* %2, align 1
  %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
  %cmp5 = icmp ult i8* %add.ptr4, %p3
  br i1 %cmp5, label %cleanup.cont, label %return

cleanup.cont:                                     ; preds = %while.body
  %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
  br label %while.cond

while.end:                                        ; preds = %while.cond
  tail call void @_Z1fv()
  unreachable

return:                                           ; preds = %while.body
  ret i8* %p3
}

The only difference between a.ll and b.ll is the basicblock cleanup.cont.

-------------------------- a.ll after loop rotate, same as a.ll before loop
rotate ----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll

; ModuleID = '<stdin>'
source_filename = "<stdin>"

@i = global i8 0, align 1

declare void @_Z1fv() local_unnamed_addr

define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr {
entry:
  br label %while.cond

while.cond:                                       ; preds = %while.body,
%entry
  %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
  %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
  %cmp = icmp ugt i8* %h.addr.0, @i
  br i1 %cmp, label %while.end, label %while.body

while.body:                                       ; preds = %while.cond
  %0 = bitcast i8* %d.addr.0 to i64*
  %1 = load i64, i64* %0, align 1
  %2 = bitcast i8* %h.addr.0 to i64*
  store i64 %1, i64* %2, align 1
  %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
  %3 = bitcast i8* %add.ptr to i64*
  %4 = load i64, i64* %3, align 1
  store i64 %4, i64* %2, align 1
  %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
  %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
  %cmp5 = icmp ult i8* %add.ptr4, %p3
  br i1 %cmp5, label %while.cond, label %return

while.end:                                        ; preds = %while.cond
  tail call void @_Z1fv()
  unreachable

return:                                           ; preds = %while.body
  ret i8* %p3
}

-------------------------- b.ll after loop rotate
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll

@i = global i8 0, align 1
declare void @_Z1fv() local_unnamed_addr

define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
returned %p3) local_unnamed_addr {
entry:
  %cmp1 = icmp ugt i8* %h, @i
  br i1 %cmp1, label %while.end, label %while.body.lr.ph

while.body.lr.ph:                                 ; preds = %entry
  br label %while.body

while.cond:                                       ; preds = %while.body
  %h.addr.0 = phi i8* [ %add.ptr4, %while.body ]
  %d.addr.0 = phi i8* [ %add.ptr3, %while.body ]
  %cmp = icmp ugt i8* %h.addr.0, @i
  br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.cond
  %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0, %while.cond ]
  %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0, %while.cond ]
  %0 = bitcast i8* %d.addr.03 to i64*
  %1 = load i64, i64* %0, align 1
  %2 = bitcast i8* %h.addr.02 to i64*
  store i64 %1, i64* %2, align 1
  %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8
  %3 = bitcast i8* %add.ptr to i64*
  %4 = load i64, i64* %3, align 1
  store i64 %4, i64* %2, align 1
  %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6
  %cmp5 = icmp ult i8* %add.ptr4, %p3
  %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6
  br i1 %cmp5, label %while.cond, label %return

while.cond.while.end_crit_edge:                   ; preds = %while.cond
  br label %while.end

while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
  tail call void @_Z1fv()
  unreachable

return:                                           ; preds = %while.body
  ret i8* %p3
}

a.ll and b.ll have different results after loop rotation because of
http://llvm.org/viewvc/llvm-project?view=revision&revision=181230
<https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg>

-------------------------- a.s generated from a.ll
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
|~/workarea/llvm-r318298/dbuild/bin/llc

.cfi_startproc
# BB#0:                                 # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
.p2align 4, 0x90
.LBB0_1:                                # %while.cond
                                        # =>This Inner Loop Header: Depth=1
cmpq %rax, %rsi
ja .LBB0_4
# BB#2:                                 # %while.body
                                        #   in Loop: Header=BB0_1 Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rdi
addq $6, %rsi
cmpq %rdx, %rsi
jb .LBB0_1
# BB#3:                                 # %return
movq %rdx, %rax
popq %rcx
retq
.LBB0_4:                                # %while.end
callq _Z1fv
.Lfunc_end0:
.size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
.cfi_endproc

call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s will
contain mostly fall through branches.

-------------------------- b.s generated from b.ll
----------------------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
|~/workarea/llvm-r318298/dbuild/bin/llc

.cfi_startproc
# BB#0:                                 # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
cmpq %rax, %rsi
ja .LBB0_5
# BB#1:
movl $i, %eax
.p2align 4, 0x90
.LBB0_3:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rsi
cmpq %rdx, %rsi
jae .LBB0_4
# BB#2:                                 # %while.cond
                                        #   in Loop: Header=BB0_3 Depth=1
addq $6, %rdi
cmpq %rax, %rsi
jbe .LBB0_3
.LBB0_5:                                # %while.end
callq _Z1fv
.LBB0_4:                                # %return
movq %rdx, %rax
popq %rcx
retq
.Lfunc_end0:
.size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
.cfi_endproc

call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not fall
through when exiting the loop. The non-fall-through branch increases branch
misses significantly and regresses the benchmark by 10%.

Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like
what tail duplication does, but basicblock .LBB0_3 contains 7 instructions
and it will introduce some cost of code size increase.

Any suggestions for fixing the issue are welcome!
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/0fe56494/attachment.html>

Wei Mi via llvm-dev

2017-Dec-19 00:21 UTC

head link

[llvm-dev] A code layout related side-effect introduced by rL318299

On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:
> Hi,
>
> Recently 10% performance regression on an important benchmark showed up
> after we integrated https://reviews.llvm.org/rL318299. The analysis
> showed that rL318299 triggered loop rotation on an multi exits loop, and
> the loop rotation introduced code layout issue. The performance regression
> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to
> illustrate the problem. a.ll was generated by rL318298 and b.ll was
> generated by rL318299.
>
a.ll and b.ll are generated from the same function by different versions of
the compiler. a.s and b.s, generated from a.ll and b.ll respectively, show
where the performance difference comes from.
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/94f6cc0d/attachment.html>

Chandler Carruth via llvm-dev

2017-Dec-19 01:01 UTC

head link

[llvm-dev] A code layout related side-effect introduced by rL318299

What I can't figure out is why we don't get the same basic block layout
for b.ll....

Specifically, I understand that we essentially have an iteration peeled out
of the loop by loop rotation. But the result still should get the fancy
layout that causes the hot conditional branch (away from the unreachable)
to fall through into the loop header...

On Mon, Dec 18, 2017 at 4:21 PM Wei Mi <wmi at google.com> wrote:
> On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:
>
>> Hi,
>>
>> Recently 10% performance regression on an important benchmark showed up
>> after we integrated https://reviews.llvm.org/rL318299. The analysis
>> showed that rL318299 triggered loop rotation on an multi exits loop, and
>> the loop rotation introduced code layout issue. The performance regression
>> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to
>> illustrate the problem. a.ll was generated by rL318298 and b.ll was
>> generated by rL318299.
>>
>
> a.ll and b.ll are generated from the same function by different versions
> of compiler. a.s and b.s generated from a.ll and b.ll respectively showed
> where the performance difference comes from.
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/bb40f009/attachment.html>

Xinliang David Li via llvm-dev

2017-Dec-19 01:46 UTC

head link

[llvm-dev] A code layout related side-effect introduced by rL318299

The introduction of the cleanup.cont block in b.ll, even without loop rotation,
already makes the layout worse than a.ll.


Without the cleanup.cont block, the layout is

entry->while.cond -> while.body->ret

All the arrows are hot fall-through edges, which is good.

With cleanup.cont introduced in b.ll, the layout without tail duplication or
loop rotation looks like:


entry->while.cond ->while.body->cleanup.cont,  ret

Note that now there is no fall-through edge to the 'ret' block and while.body
now needs to explicitly branch to 'ret'. If loop rotation happens, we
will have

entry, cleanup.cont -> while.cond -> while.body->ret

Note that there will be a hot branch from entry to while.cond.

LLVM actually does both tail duplication and loop rotation, and the layout
looks like:

entry --> while.cond.dup, cleanup.cont->while.cond -> while.body->ret

This is better than the previous one, but there is still a hot branch
introduced from while.cond.dup to while.body.

David

On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:
> Hi,
>
> Recently 10% performance regression on an important benchmark showed up
> after we integrated https://reviews.llvm.org/rL318299. The analysis
> showed that rL318299 triggered loop rotation on an multi exits loop, and
> the loop rotation introduced code layout issue. The performance regression
> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to
> illustrate the problem. a.ll was generated by rL318298 and b.ll was
> generated by rL318299.
>
> -------------------------- a.ll ----------------------------
> declare void @_Z1fv() local_unnamed_addr #2
> @i = global i8 0, align 1
>
> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
> returned %p3) local_unnamed_addr #3 {
> entry:
>   br label %while.cond
>
> while.cond:                                       ; preds = %while.body,
> %entry
>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
>   %cmp = icmp ugt i8* %h.addr.0, @i
>   br i1 %cmp, label %while.end, label %while.body
>
> while.body:                                       ; preds = %while.cond
>   %0 = bitcast i8* %d.addr.0 to i64*
>   %1 = load i64, i64* %0, align 1
>   %2 = bitcast i8* %h.addr.0 to i64*
>   store i64 %1, i64* %2, align 1
>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>   %3 = bitcast i8* %add.ptr to i64*
>   %4 = load i64, i64* %3, align 1
>   store i64 %4, i64* %2, align 1
>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>   br i1 %cmp5, label %while.cond, label %return
>
> while.end:                                        ; preds = %while.cond
>   tail call void @_Z1fv()
>   unreachable
>
> return:                                           ; preds = %while.body
>   ret i8* %p3
> }
>
>
> -------------------------- b.ll ----------------------------
> declare void @_Z1fv() local_unnamed_addr #2
> @i = global i8 0, align 1
>
> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
> returned %p3) local_unnamed_addr #3 {
> entry:
>   br label %while.cond
>
> while.cond:                                       ; preds = %cleanup.cont,
> %entry
>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ]
>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ]
>   %cmp = icmp ugt i8* %h.addr.0, @i
>   br i1 %cmp, label %while.end, label %while.body
>
> while.body:                                       ; preds = %while.cond
>   %0 = bitcast i8* %d.addr.0 to i64*
>   %1 = load i64, i64* %0, align 1
>   %2 = bitcast i8* %h.addr.0 to i64*
>   store i64 %1, i64* %2, align 1
>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>   %3 = bitcast i8* %add.ptr to i64*
>   %4 = load i64, i64* %3, align 1
>   store i64 %4, i64* %2, align 1
>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>   br i1 %cmp5, label %cleanup.cont, label %return
>
> cleanup.cont:                                     ; preds = %while.body
>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>   br label %while.cond
>
> while.end:                                        ; preds = %while.cond
>   tail call void @_Z1fv()
>   unreachable
>
> return:                                           ; preds = %while.body
>   ret i8* %p3
> }
>
> The only difference between a.ll and b.ll is the basicblock cleanup.cont.
>
> -------------------------- a.ll after loop rotate, same as a.ll before
> loop rotate ----------------------------
> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
>
> ; ModuleID = '<stdin>'
> source_filename = "<stdin>"
>
> @i = global i8 0, align 1
>
> declare void @_Z1fv() local_unnamed_addr
>
> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
> returned %p3) local_unnamed_addr {
> entry:
>   br label %while.cond
>
> while.cond:                                       ; preds = %while.body,
> %entry
>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
>   %cmp = icmp ugt i8* %h.addr.0, @i
>   br i1 %cmp, label %while.end, label %while.body
>
> while.body:                                       ; preds = %while.cond
>   %0 = bitcast i8* %d.addr.0 to i64*
>   %1 = load i64, i64* %0, align 1
>   %2 = bitcast i8* %h.addr.0 to i64*
>   store i64 %1, i64* %2, align 1
>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>   %3 = bitcast i8* %add.ptr to i64*
>   %4 = load i64, i64* %3, align 1
>   store i64 %4, i64* %2, align 1
>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>   br i1 %cmp5, label %while.cond, label %return
>
> while.end:                                        ; preds = %while.cond
>   tail call void @_Z1fv()
>   unreachable
>
> return:                                           ; preds = %while.body
>   ret i8* %p3
> }
>
> -------------------------- b.ll after loop rotate
> ----------------------------
> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
>
> @i = global i8 0, align 1
> declare void @_Z1fv() local_unnamed_addr
>
> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
> returned %p3) local_unnamed_addr {
> entry:
>   %cmp1 = icmp ugt i8* %h, @i
>   br i1 %cmp1, label %while.end, label %while.body.lr.ph
>
> while.body.lr.ph:                                 ; preds = %entry
>   br label %while.body
>
> while.cond:                                       ; preds = %while.body
>   %h.addr.0 = phi i8* [ %add.ptr4, %while.body ]
>   %d.addr.0 = phi i8* [ %add.ptr3, %while.body ]
>   %cmp = icmp ugt i8* %h.addr.0, @i
>   br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body
>
> while.body:                                       ; preds = %while.body.lr.ph, %while.cond
>   %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0,
> %while.cond ]
>   %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0,
> %while.cond ]
>   %0 = bitcast i8* %d.addr.03 to i64*
>   %1 = load i64, i64* %0, align 1
>   %2 = bitcast i8* %h.addr.02 to i64*
>   store i64 %1, i64* %2, align 1
>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8
>   %3 = bitcast i8* %add.ptr to i64*
>   %4 = load i64, i64* %3, align 1
>   store i64 %4, i64* %2, align 1
>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6
>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6
>   br i1 %cmp5, label %while.cond, label %return
>
> while.cond.while.end_crit_edge:                   ; preds = %while.cond
>   br label %while.end
>
> while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
>   tail call void @_Z1fv()
>   unreachable
>
> return:                                           ; preds = %while.body
>   ret i8* %p3
> }
>
> a.ll and b.ll have different results after loop rotation because of
> http://llvm.org/viewvc/llvm-project?view=revision&revision=181230
>
<https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg>
>
> -------------------------- a.s generated from a.ll
> ----------------------------
> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
> |~/workarea/llvm-r318298/dbuild/bin/llc
>
> .cfi_startproc
> # BB#0:                                 # %entry
> pushq %rax
> .cfi_def_cfa_offset 16
> movl $i, %eax
> .p2align 4, 0x90
> .LBB0_1:                                # %while.cond
>                                         # =>This Inner Loop Header:
Depth=1
> cmpq %rax, %rsi
> ja .LBB0_4
> # BB#2:                                 # %while.body
>                                         #   in Loop: Header=BB0_1 Depth=1
> movq (%rdi), %rcx
> movq %rcx, (%rsi)
> movq 8(%rdi), %rcx
> movq %rcx, (%rsi)
> addq $6, %rdi
> addq $6, %rsi
> cmpq %rdx, %rsi
> jb .LBB0_1
> # BB#3:                                 # %return
> movq %rdx, %rax
> popq %rcx
> retq
> .LBB0_4:                                # %while.end
> callq _Z1fv
> .Lfunc_end0:
> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
> .cfi_endproc
>
> call _Z1fv is unreachable. Suppose loop LBB0_1 has few iterations, a.s
> will contain mostly fall through branches.
>
> -------------------------- b.s generated from b.ll
> ----------------------------
> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
> |~/workarea/llvm-r318298/dbuild/bin/llc
>
> .cfi_startproc
> # BB#0:                                 # %entry
> pushq %rax
> .cfi_def_cfa_offset 16
> movl $i, %eax
> cmpq %rax, %rsi
> ja .LBB0_5
> # BB#1:
> movl $i, %eax
> .p2align 4, 0x90
> .LBB0_3:                                # %while.body
>                                         # =>This Inner Loop Header:
Depth=1
> movq (%rdi), %rcx
> movq %rcx, (%rsi)
> movq 8(%rdi), %rcx
> movq %rcx, (%rsi)
> addq $6, %rsi
> cmpq %rdx, %rsi
> jae .LBB0_4
> # BB#2:                                 # %while.cond
>                                         #   in Loop: Header=BB0_3 Depth=1
> addq $6, %rdi
> cmpq %rax, %rsi
> jbe .LBB0_3
> .LBB0_5:                                # %while.end
> callq _Z1fv
> .LBB0_4:                                # %return
> movq %rdx, %rax
> popq %rcx
> retq
> .Lfunc_end0:
> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
> .cfi_endproc
>
> call _Z1fv is unreachable. Here we have "jae .LBB0_4" which will not fall
> through when exiting the loop. The non-fall-through branch increases branch
> misses significantly and regresses the benchmark by 10%.
>
> Now a possible way to fix it is to duplicate basicblock .LBB0_3, just like
> what tail duplication does, but basicblock .LBB0_3 contains 7 instructions
> and it will introduce some cost of code size increase.
>
> Any suggestion to fix the issue are welcomed!
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171218/63053299/attachment-0001.html>

Chandler Carruth via llvm-dev

2017-Dec-19 02:03 UTC

head link

[llvm-dev] A code layout related side-effect introduced by rL318299

On Mon, Dec 18, 2017 at 5:46 PM Xinliang David Li <davidxl at google.com>
wrote:
> The introduction of the cleanup.cont block in b.ll, even without loop rotation,
> already makes the layout worse than a.ll.
>
>
> Without the cleanup.cont block, the layout is
>
> entry->while.cond -> while.body->ret
>
> All the arrows are hot fall-through edges, which is good.
>
> With cleanup.cont introduced in b.ll, the layout without tail duplication or
> loop rotation looks like:
>
>
> entry->while.cond ->while.body->cleanup.cont,  ret
>
> Note that now there is no fall-through edge to the 'ret' block and while.body
> now needs to explicitly branch to 'ret'. If loop rotation happens, we
> will have
>
> entry, cleanup.cont -> while.cond -> while.body->ret
>
> Note that there will be a hot branch from entry to while.cond.
>
> LLVM actually does both tail duplication and loop rotation, and the layout
> looks like:
>
> entry --> while.cond.dup, cleanup.cont->while.cond -> while.body->ret
>
> This is better than the previous one, but there is still a hot branch
> introduced from while.cond.dup to while.body.
>
(just relaying a comment I made in person)

This does seem like a less good layout, but as a consequence of it we do
avoid one addition and comparison along the hot path. It's not obvious to
me which is better: the better layout (with added instruction) or the
minimal set of instructions with less good layout.

>
> David
> On Mon, Dec 18, 2017 at 4:14 PM, Wei Mi <wmi at google.com> wrote:
>
>> Hi,
>>
>> Recently 10% performance regression on an important benchmark showed up
>> after we integrated https://reviews.llvm.org/rL318299. The analysis
>> showed that rL318299 triggered loop rotation on an multi exits loop, and
>> the loop rotation introduced code layout issue. The performance regression
>> is a side-effect of rL318299. I got two testcases a.ll and b.ll attached to
>> illustrate the problem. a.ll was generated by rL318298 and b.ll was
>> generated by rL318299.
>>
>> -------------------------- a.ll ----------------------------
>> declare void @_Z1fv() local_unnamed_addr #2
>> @i = global i8 0, align 1
>>
>> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
>> returned %p3) local_unnamed_addr #3 {
>> entry:
>>   br label %while.cond
>>
>> while.cond:                                       ; preds =
%while.body,
>> %entry
>>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
>>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
>>   %cmp = icmp ugt i8* %h.addr.0, @i
>>   br i1 %cmp, label %while.end, label %while.body
>>
>> while.body:                                       ; preds = %while.cond
>>   %0 = bitcast i8* %d.addr.0 to i64*
>>   %1 = load i64, i64* %0, align 1
>>   %2 = bitcast i8* %h.addr.0 to i64*
>>   store i64 %1, i64* %2, align 1
>>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>>   %3 = bitcast i8* %add.ptr to i64*
>>   %4 = load i64, i64* %3, align 1
>>   store i64 %4, i64* %2, align 1
>>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>>   br i1 %cmp5, label %while.cond, label %return
>>
>> while.end:                                        ; preds = %while.cond
>>   tail call void @_Z1fv()
>>   unreachable
>>
>> return:                                           ; preds = %while.body
>>   ret i8* %p3
>> }
>>
>>
>> -------------------------- b.ll ----------------------------
>> declare void @_Z1fv() local_unnamed_addr #2
>> @i = global i8 0, align 1
>>
>> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
>> returned %p3) local_unnamed_addr #3 {
>> entry:
>>   br label %while.cond
>>
>> while.cond:                                       ; preds = %cleanup.cont, %entry
>>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %cleanup.cont ]
>>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %cleanup.cont ]
>>   %cmp = icmp ugt i8* %h.addr.0, @i
>>   br i1 %cmp, label %while.end, label %while.body
>>
>> while.body:                                       ; preds = %while.cond
>>   %0 = bitcast i8* %d.addr.0 to i64*
>>   %1 = load i64, i64* %0, align 1
>>   %2 = bitcast i8* %h.addr.0 to i64*
>>   store i64 %1, i64* %2, align 1
>>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>>   %3 = bitcast i8* %add.ptr to i64*
>>   %4 = load i64, i64* %3, align 1
>>   store i64 %4, i64* %2, align 1
>>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>>   br i1 %cmp5, label %cleanup.cont, label %return
>>
>> cleanup.cont:                                     ; preds = %while.body
>>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>>   br label %while.cond
>>
>> while.end:                                        ; preds = %while.cond
>>   tail call void @_Z1fv()
>>   unreachable
>>
>> return:                                           ; preds = %while.body
>>   ret i8* %p3
>> }
>>
>> The only difference between a.ll and b.ll is the basicblock
cleanup.cont.
>>
>> -------------------------- a.ll after loop rotate, same as a.ll before
>> loop rotate ----------------------------
>> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
>>
>> ; ModuleID = '<stdin>'
>> source_filename = "<stdin>"
>>
>> @i = global i8 0, align 1
>>
>> declare void @_Z1fv() local_unnamed_addr
>>
>> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
>> returned %p3) local_unnamed_addr {
>> entry:
>>   br label %while.cond
>>
>> while.cond:                                       ; preds =
%while.body,
>> %entry
>>   %h.addr.0 = phi i8* [ %h, %entry ], [ %add.ptr4, %while.body ]
>>   %d.addr.0 = phi i8* [ %d, %entry ], [ %add.ptr3, %while.body ]
>>   %cmp = icmp ugt i8* %h.addr.0, @i
>>   br i1 %cmp, label %while.end, label %while.body
>>
>> while.body:                                       ; preds = %while.cond
>>   %0 = bitcast i8* %d.addr.0 to i64*
>>   %1 = load i64, i64* %0, align 1
>>   %2 = bitcast i8* %h.addr.0 to i64*
>>   store i64 %1, i64* %2, align 1
>>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.0, i64 8
>>   %3 = bitcast i8* %add.ptr to i64*
>>   %4 = load i64, i64* %3, align 1
>>   store i64 %4, i64* %2, align 1
>>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.0, i64 6
>>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.0, i64 6
>>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>>   br i1 %cmp5, label %while.cond, label %return
>>
>> while.end:                                        ; preds = %while.cond
>>   tail call void @_Z1fv()
>>   unreachable
>>
>> return:                                           ; preds = %while.body
>>   ret i8* %p3
>> }
>>
>> -------------------------- b.ll after loop rotate
>> ----------------------------
>> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
>>
>> @i = global i8 0, align 1
>> declare void @_Z1fv() local_unnamed_addr
>>
>> define i8* @_Z1gPcS_S_(i8* nocapture readonly %d, i8* %h, i8* readnone
>> returned %p3) local_unnamed_addr {
>> entry:
>>   %cmp1 = icmp ugt i8* %h, @i
>>   br i1 %cmp1, label %while.end, label %while.body.lr.ph
>>
>> while.body.lr.ph:                                 ; preds = %entry
>>   br label %while.body
>>
>> while.cond:                                       ; preds = %while.body
>>   %h.addr.0 = phi i8* [ %add.ptr4, %while.body ]
>>   %d.addr.0 = phi i8* [ %add.ptr3, %while.body ]
>>   %cmp = icmp ugt i8* %h.addr.0, @i
>>   br i1 %cmp, label %while.cond.while.end_crit_edge, label %while.body
>>
>> while.body:                                       ; preds = %while.body.lr.ph, %while.cond
>>   %d.addr.03 = phi i8* [ %d, %while.body.lr.ph ], [ %d.addr.0,
>> %while.cond ]
>>   %h.addr.02 = phi i8* [ %h, %while.body.lr.ph ], [ %h.addr.0,
>> %while.cond ]
>>   %0 = bitcast i8* %d.addr.03 to i64*
>>   %1 = load i64, i64* %0, align 1
>>   %2 = bitcast i8* %h.addr.02 to i64*
>>   store i64 %1, i64* %2, align 1
>>   %add.ptr = getelementptr inbounds i8, i8* %d.addr.03, i64 8
>>   %3 = bitcast i8* %add.ptr to i64*
>>   %4 = load i64, i64* %3, align 1
>>   store i64 %4, i64* %2, align 1
>>   %add.ptr4 = getelementptr inbounds i8, i8* %h.addr.02, i64 6
>>   %cmp5 = icmp ult i8* %add.ptr4, %p3
>>   %add.ptr3 = getelementptr inbounds i8, i8* %d.addr.03, i64 6
>>   br i1 %cmp5, label %while.cond, label %return
>>
>> while.cond.while.end_crit_edge:                   ; preds = %while.cond
>>   br label %while.end
>>
>> while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
>>   tail call void @_Z1fv()
>>   unreachable
>>
>> return:                                           ; preds = %while.body
>>   ret i8* %p3
>> }
>>
>> a.ll and b.ll have different results after loop rotation because of
>> http://llvm.org/viewvc/llvm-project?view=revision&revision=181230
>>
<https://www.google.com/url?q=http://llvm.org/viewvc/llvm-project?view%3Drevision%26revision%3D181230&sa=D&usg=AFQjCNHIQDnlfGByPF-pvV991MH72_ExNg>
>>
>> -------------------------- a.s generated from a.ll
>> ----------------------------
>> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < a.ll
>> |~/workarea/llvm-r318298/dbuild/bin/llc
>>
>> .cfi_startproc
>> # BB#0:                                 # %entry
>> pushq %rax
>> .cfi_def_cfa_offset 16
>> movl $i, %eax
>> .p2align 4, 0x90
>> .LBB0_1:                                # %while.cond
>>                                         # =>This Inner Loop Header:
>> Depth=1
>> cmpq %rax, %rsi
>> ja .LBB0_4
>> # BB#2:                                 # %while.body
>>                                         #   in Loop: Header=BB0_1 Depth=1
>> movq (%rdi), %rcx
>> movq %rcx, (%rsi)
>> movq 8(%rdi), %rcx
>> movq %rcx, (%rsi)
>> addq $6, %rdi
>> addq $6, %rsi
>> cmpq %rdx, %rsi
>> jb .LBB0_1
>> # BB#3:                                 # %return
>> movq %rdx, %rax
>> popq %rcx
>> retq
>> .LBB0_4:                                # %while.end
>> callq _Z1fv
>> .Lfunc_end0:
>> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
>> .cfi_endproc
>>
>> call _Z1fv is unreachable. Suppose loop LBB0_1 has only a few iterations;
>> a.s will then contain mostly fall-through branches.
>>
>> -------------------------- b.s generated from b.ll
>> ----------------------------
>> ~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
>> |~/workarea/llvm-r318298/dbuild/bin/llc
>>
>> .cfi_startproc
>> # BB#0:                                 # %entry
>> pushq %rax
>> .cfi_def_cfa_offset 16
>> movl $i, %eax
>> cmpq %rax, %rsi
>> ja .LBB0_5
>> # BB#1:
>> movl $i, %eax
>> .p2align 4, 0x90
>> .LBB0_3:                                # %while.body
>>                                         # =>This Inner Loop Header:
>> Depth=1
>> movq (%rdi), %rcx
>> movq %rcx, (%rsi)
>> movq 8(%rdi), %rcx
>> movq %rcx, (%rsi)
>> addq $6, %rsi
>> cmpq %rdx, %rsi
>> jae .LBB0_4
>> # BB#2:                                 # %while.cond
>>                                         #   in Loop: Header=BB0_3 Depth=1
>> addq $6, %rdi
>> cmpq %rax, %rsi
>> jbe .LBB0_3
>> .LBB0_5:                                # %while.end
>> callq _Z1fv
>> .LBB0_4:                                # %return
>> movq %rdx, %rax
>> popq %rcx
>> retq
>> .Lfunc_end0:
>> .size _Z1gPcS_S_, .Lfunc_end0-_Z1gPcS_S_
>> .cfi_endproc
>>
>> call _Z1fv is unreachable. Here we have "jae .LBB0_4", which will not
>> fall through when exiting the loop. The non-fall-through branch increases
>> branch misses significantly and regresses the benchmark by 10%.
>>
>> One possible way to fix it is to duplicate basic block .LBB0_3, just
>> as tail duplication does, but basic block .LBB0_3 contains 7
>> instructions, so duplicating it would cost some code size.
>>
>> Any suggestions for fixing the issue are welcome!
>>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171219/83d74bb9/attachment.html>

Maybe Matching Threads

Search for more seemingly similar threads

llvm dev - Dec 2017 - A code layout related side-effect introduced by rL318299

[llvm-dev] A code layout related side-effect introduced by rL318299

[llvm-dev] A code layout related side-effect introduced by rL318299

[llvm-dev] A code layout related side-effect introduced by rL318299

[llvm-dev] A code layout related side-effect introduced by rL318299

[llvm-dev] A code layout related side-effect introduced by rL318299

Maybe Matching Threads