thr3ads.net - llvm dev - [llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure [Oct 2017]

If this information is useful, please help other people find it:
Share via:

Stefan Pintilie via llvm-dev

2017-Oct-13 20:09 UTC

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

Hi, 

I've been looking at the Machine Scheduler on Power PC.  I am looking only 
at the pre-RA machine scheduler and I am running it in the default 
bi-directional mode (so, both top down and bottom up queues are 
considered). I've come across an example where the scheduler picks a poor 
ordering for the instructions which results in very high register pressure 
which results in spills. The problem comes from the fact that the Machine 
Scheduler uses a maximum latency limit when it considers instructions to 
schedule. A high latency instruction will not be scheduled before all of 
the available lower latency instructions are scheduled. This happens 
regardless of register pressure since the higher latency instruction is 
not even added to the "Available" queue that is used when the heuristics
pick an instruction to schedule next.

My question is: Why do we have that latency limit in the first place? If 
an instruction can be scheduled (i.e., all the instructions it depends on are
already scheduled) shouldn't it be at least considered?



The example is listed below:

test.c
--
long A[100];
long func(long* num, long* den) {
// This loop is unrolled
for (int i=0; i<6; i++) {
  A[i] = num[i] / den[i];
}
return 0;
}
--

Compile commands
--
clang -c -m64 -O3 -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 
-fexperimental-new-pass-manager test.c -S -emit-llvm
llc test.ll -O3 -ppc-asm-full-reg-names -debug-only=machine-scheduler -o 
test-p9.s > listing.out 2>&1
--

Looking at the listing.out file I've noticed that all of the loads are 
grouped together at the start of the function. Those loads use 12 
registers before any of the divides are scheduled. As a result, we end up 
with significantly higher register pressure after all the loads.
--
0B      BB#0: derived from LLVM BB %entry
            Live Ins: %X3 %X4
16B             %vreg1<def> = COPY %X4; G8RC_and_G8RC_NOX0:%vreg1
32B             %vreg0<def> = COPY %X3; G8RC_and_G8RC_NOX0:%vreg0
48B             %vreg2<def> = LD 0, %vreg0; mem:LD8[%num](tbaa=!4) 
G8RC:%vreg2 G8RC_and_G8RC_NOX0:%vreg0
64B             %vreg3<def> = LD 0, %vreg1; mem:LD8[%den](tbaa=!4) 
G8RC:%vreg3 G8RC_and_G8RC_NOX0:%vreg1
144B            %vreg7<def> = LD 8, %vreg0; mem:LD8[%arrayidx.1](tbaa=!4) 
G8RC:%vreg7 G8RC_and_G8RC_NOX0:%vreg0
160B            %vreg8<def> = LD 8, %vreg1; mem:LD8[%arrayidx2.1](tbaa=!4)
G8RC:%vreg8 G8RC_and_G8RC_NOX0:%vreg1
208B            %vreg10<def> = LD 16, %vreg0; 
mem:LD8[%arrayidx.2](tbaa=!4) G8RC:%vreg10 G8RC_and_G8RC_NOX0:%vreg0
224B            %vreg11<def> = LD 16, %vreg1; 
mem:LD8[%arrayidx2.2](tbaa=!4) G8RC:%vreg11 G8RC_and_G8RC_NOX0:%vreg1
272B            %vreg13<def> = LD 24, %vreg0; 
mem:LD8[%arrayidx.3](tbaa=!4) G8RC:%vreg13 G8RC_and_G8RC_NOX0:%vreg0
288B            %vreg14<def> = LD 24, %vreg1; 
mem:LD8[%arrayidx2.3](tbaa=!4) G8RC:%vreg14 G8RC_and_G8RC_NOX0:%vreg1
336B            %vreg16<def> = LD 32, %vreg0; 
mem:LD8[%arrayidx.4](tbaa=!4) G8RC:%vreg16 G8RC_and_G8RC_NOX0:%vreg0
352B            %vreg17<def> = LD 32, %vreg1; 
mem:LD8[%arrayidx2.4](tbaa=!4) G8RC:%vreg17 G8RC_and_G8RC_NOX0:%vreg1
400B            %vreg19<def> = LD 40, %vreg0; 
mem:LD8[%arrayidx.5](tbaa=!4) G8RC:%vreg19 G8RC_and_G8RC_NOX0:%vreg0
416B            %vreg20<def> = LD 40, %vreg1; 
mem:LD8[%arrayidx2.5](tbaa=!4) G8RC:%vreg20 G8RC_and_G8RC_NOX0:%vreg1
424B            %vreg4<def> = DIVD %vreg2, %vreg3; 
G8RC:%vreg4,%vreg2,%vreg3
432B            %vreg9<def> = DIVD %vreg7, %vreg8; 
G8RC:%vreg9,%vreg7,%vreg8
440B            %vreg12<def> = DIVD %vreg10, %vreg11; 
G8RC:%vreg12,%vreg10,%vreg11
448B            %vreg15<def> = DIVD %vreg13, %vreg14; 
G8RC:%vreg15,%vreg13,%vreg14
456B            %vreg18<def> = DIVD %vreg16, %vreg17; 
G8RC:%vreg18,%vreg16,%vreg17
464B            %vreg21<def> = DIVD %vreg19, %vreg20; 
G8RC:%vreg21,%vreg19,%vreg20
472B            %vreg5<def> = ADDIStocHA %X2, <ga:@A>; 
G8RC_and_G8RC_NOX0:%vreg5
480B            %vreg6<def> = LDtocL <ga:@A>, %vreg5, %X2<imp-use>;
mem:LD8[GOT] G8RC_and_G8RC_NOX0:%vreg6,%vreg5
504B            %X3<def> = LI8 0
512B            STD %vreg4, 0, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 0)](tbaa=!4) G8RC:%vreg4 
G8RC_and_G8RC_NOX0:%vreg6
520B            STD %vreg9, 8, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 1)](tbaa=!4) G8RC:%vreg9 
G8RC_and_G8RC_NOX0:%vreg6
528B            STD %vreg12, 16, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 2)](tbaa=!4) G8RC:%vreg12 
G8RC_and_G8RC_NOX0:%vreg6
536B            STD %vreg15, 24, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 3)](tbaa=!4) G8RC:%vreg15 
G8RC_and_G8RC_NOX0:%vreg6
544B            STD %vreg18, 32, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 4)](tbaa=!4) G8RC:%vreg18 
G8RC_and_G8RC_NOX0:%vreg6
552B            STD %vreg21, 40, %vreg6; mem:ST8[getelementptr inbounds 
([100 x i64], [100 x i64]* @A, i64 0, i64 5)](tbaa=!4) G8RC:%vreg21 
G8RC_and_G8RC_NOX0:%vreg6
560B            BLR8 %LR8<imp-use>, %RM<imp-use>, %X3<imp-use>
--

Due to all of the register pressure built up by those loads we are forced 
to spill. Here is the final assembly.
--
# BB#0:                                 # %entry
        std r30, -16(r1)                # 8-byte Folded Spill
        ld r5, 0(r3)
        ld r6, 0(r4)
        ld r7, 8(r3)
        ld r8, 8(r4)
        ld r9, 16(r3)
        ld r10, 16(r4)
        ld r11, 24(r3)
        ld r0, 32(r3)
        ld r12, 24(r4)
        ld r30, 32(r4)
        ld r3, 40(r3)
        ld r4, 40(r4)
        divd r5, r5, r6
        divd r6, r7, r8
        divd r7, r9, r10
        divd r9, r0, r30
        divd r4, r3, r4
        divd r8, r11, r12
        addis r3, r2, .LC0@toc@ha
        ld r30, -16(r1)                 # 8-byte Folded Reload
        ld r10, .LC0@toc@l(r3)
        li r3, 0
        std r5, 0(r10)
        std r6, 8(r10)
        std r7, 16(r10)
        std r9, 32(r10)
        std r8, 24(r10)
        std r4, 40(r10)
        blr
--

Thank you, 
Stefan Pintilie

-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171013/72eaf14d/attachment.html>

Matthias Braun via llvm-dev

2017-Oct-13 20:46 UTC

head link

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

Yes, I've run into the problem myself that the Pending queue isn't even
checked with the tryCandidate() logic and so takes priority over all other
scheduling decisions.

I personally would be open to changes in this area. To start the brainstorming I
could imagine that we move nodes below a target specific limit into the
available queue instead of just when they hit their latency cycle limits. And
then let tryCandidate() weigh the remaining cycles against other scheduling
criteria.

Also keep in mind:
- When making those changes, be careful to get the cycle bumping/simulation
logic right
- The pending queue is also used as a mechanism to keep compile time (see
ReadyListLimit) in check (as checking every candidate for every instruction we
schedule is O(n**2)).

- Matthias
> On Oct 13, 2017, at 1:09 PM, Stefan Pintilie via llvm-dev <llvm-dev at
lists.llvm.org> wrote:
> 
> Hi, 
> 
> I've been looking at the Machine Scheduler on Power PC.  I am looking
only at the pre-RA machine scheduler and I am running it in the default
bi-directional mode (so, both top down and bottom up queues are considered).
I've come across an example where the scheduler picks a poor ordering for
the instructions which results in very high register pressure which results in
spills. The problem comes from the fact that the Machine Scheduler uses a
maximum latency limit when it considers instructions to schedule. A high latency
instruction will not be scheduled before all of the available lower latency
instructions are scheduled. This happens regardless of register pressure since
the higher latency instruction is not even added to the "Available"
queue that is used when the heuristics pick an instruction to schedule next.
> 
> My question is: Why do we have that latency limit in the first place? If an
instruction can be scheduled (ie all the instructions it depends on are already
scheduled) shouldn't it be at least considered?
> 
> 
> 
> The example is listed below:
> 
> test.c
> --
> long A[100];
> long func(long* num, long* den) {
> // This loop is unrolled
> for (int i=0; i<6; i++) {
>   A[i] = num[i] / den[i];
> }
> return 0;
> }
> --
> 
> Compile commands
> --
> clang -c -m64 -O3 -target powerpc64le-unknown-linux-gnu -mcpu=pwr9
-fexperimental-new-pass-manager test.c -S -emit-llvm
> llc test.ll -O3 -ppc-asm-full-reg-names -debug-only=machine-scheduler -o
test-p9.s > listing.out 2>&1
> --
> 
> Looking at the listing.out file I've noticed that all of the loads are
grouped together at the start of the function. Those loads use 12 registers
before any of the divides are scheduled. As a result, we end up with
significantly higher register pressure after all the loads.
> --
> 0B      BB#0: derived from LLVM BB %entry
>             Live Ins: %X3 %X4
> 16B             %vreg1<def> = COPY %X4; G8RC_and_G8RC_NOX0:%vreg1
> 32B             %vreg0<def> = COPY %X3; G8RC_and_G8RC_NOX0:%vreg0
> 48B             %vreg2<def> = LD 0, %vreg0; mem:LD8[%num](tbaa=!4)
G8RC:%vreg2 G8RC_and_G8RC_NOX0:%vreg0
> 64B             %vreg3<def> = LD 0, %vreg1; mem:LD8[%den](tbaa=!4)
G8RC:%vreg3 G8RC_and_G8RC_NOX0:%vreg1
> 144B            %vreg7<def> = LD 8, %vreg0;
mem:LD8[%arrayidx.1](tbaa=!4) G8RC:%vreg7 G8RC_and_G8RC_NOX0:%vreg0
> 160B            %vreg8<def> = LD 8, %vreg1;
mem:LD8[%arrayidx2.1](tbaa=!4) G8RC:%vreg8 G8RC_and_G8RC_NOX0:%vreg1
> 208B            %vreg10<def> = LD 16, %vreg0;
mem:LD8[%arrayidx.2](tbaa=!4) G8RC:%vreg10 G8RC_and_G8RC_NOX0:%vreg0
> 224B            %vreg11<def> = LD 16, %vreg1;
mem:LD8[%arrayidx2.2](tbaa=!4) G8RC:%vreg11 G8RC_and_G8RC_NOX0:%vreg1
> 272B            %vreg13<def> = LD 24, %vreg0;
mem:LD8[%arrayidx.3](tbaa=!4) G8RC:%vreg13 G8RC_and_G8RC_NOX0:%vreg0
> 288B            %vreg14<def> = LD 24, %vreg1;
mem:LD8[%arrayidx2.3](tbaa=!4) G8RC:%vreg14 G8RC_and_G8RC_NOX0:%vreg1
> 336B            %vreg16<def> = LD 32, %vreg0;
mem:LD8[%arrayidx.4](tbaa=!4) G8RC:%vreg16 G8RC_and_G8RC_NOX0:%vreg0
> 352B            %vreg17<def> = LD 32, %vreg1;
mem:LD8[%arrayidx2.4](tbaa=!4) G8RC:%vreg17 G8RC_and_G8RC_NOX0:%vreg1
> 400B            %vreg19<def> = LD 40, %vreg0;
mem:LD8[%arrayidx.5](tbaa=!4) G8RC:%vreg19 G8RC_and_G8RC_NOX0:%vreg0
> 416B            %vreg20<def> = LD 40, %vreg1;
mem:LD8[%arrayidx2.5](tbaa=!4) G8RC:%vreg20 G8RC_and_G8RC_NOX0:%vreg1
> 424B            %vreg4<def> = DIVD %vreg2, %vreg3;
G8RC:%vreg4,%vreg2,%vreg3
> 432B            %vreg9<def> = DIVD %vreg7, %vreg8;
G8RC:%vreg9,%vreg7,%vreg8
> 440B            %vreg12<def> = DIVD %vreg10, %vreg11;
G8RC:%vreg12,%vreg10,%vreg11
> 448B            %vreg15<def> = DIVD %vreg13, %vreg14;
G8RC:%vreg15,%vreg13,%vreg14
> 456B            %vreg18<def> = DIVD %vreg16, %vreg17;
G8RC:%vreg18,%vreg16,%vreg17
> 464B            %vreg21<def> = DIVD %vreg19, %vreg20;
G8RC:%vreg21,%vreg19,%vreg20
> 472B            %vreg5<def> = ADDIStocHA %X2, <ga:@A>;
G8RC_and_G8RC_NOX0:%vreg5
> 480B            %vreg6<def> = LDtocL <ga:@A>, %vreg5,
%X2<imp-use>; mem:LD8[GOT] G8RC_and_G8RC_NOX0:%vreg6,%vreg5
> 504B            %X3<def> = LI8 0
> 512B            STD %vreg4, 0, %vreg6; mem:ST8[getelementptr inbounds ([100
x i64], [100 x i64]* @A, i64 0, i64 0)](tbaa=!4) G8RC:%vreg4
G8RC_and_G8RC_NOX0:%vreg6
> 520B            STD %vreg9, 8, %vreg6; mem:ST8[getelementptr inbounds ([100
x i64], [100 x i64]* @A, i64 0, i64 1)](tbaa=!4) G8RC:%vreg9
G8RC_and_G8RC_NOX0:%vreg6
> 528B            STD %vreg12, 16, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 2)](tbaa=!4) G8RC:%vreg12
G8RC_and_G8RC_NOX0:%vreg6
> 536B            STD %vreg15, 24, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 3)](tbaa=!4) G8RC:%vreg15
G8RC_and_G8RC_NOX0:%vreg6
> 544B            STD %vreg18, 32, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 4)](tbaa=!4) G8RC:%vreg18
G8RC_and_G8RC_NOX0:%vreg6
> 552B            STD %vreg21, 40, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 5)](tbaa=!4) G8RC:%vreg21
G8RC_and_G8RC_NOX0:%vreg6
> 560B            BLR8 %LR8<imp-use>, %RM<imp-use>,
%X3<imp-use>
> --
> 
> Due to all of the register pressure built up by those loads we are forced
to spill. Here is the final assembly.
> --
> # BB#0:                                 # %entry
>         std r30, -16(r1)                # 8-byte Folded Spill
>         ld r5, 0(r3)
>         ld r6, 0(r4)
>         ld r7, 8(r3)
>         ld r8, 8(r4)
>         ld r9, 16(r3)
>         ld r10, 16(r4)
>         ld r11, 24(r3)
>         ld r0, 32(r3)
>         ld r12, 24(r4)
>         ld r30, 32(r4)
>         ld r3, 40(r3)
>         ld r4, 40(r4)
>         divd r5, r5, r6
>         divd r6, r7, r8
>         divd r7, r9, r10
>         divd r9, r0, r30
>         divd r4, r3, r4
>         divd r8, r11, r12
>         addis r3, r2, .LC0 at toc@ha
>         ld r30, -16(r1)                 # 8-byte Folded Reload
>         ld r10, .LC0 at toc@l(r3)
>         li r3, 0
>         std r5, 0(r10)
>         std r6, 8(r10)
>         std r7, 16(r10)
>         std r9, 32(r10)
>         std r8, 24(r10)
>         std r4, 40(r10)
>         blr
> --
> 
> Thank you, 
> Stefan Pintilie
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171013/b2a9f65c/attachment.html>

Andrew Trick via llvm-dev

2017-Oct-13 22:01 UTC

head link

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

> On Oct 13, 2017, at 1:46 PM, Matthias Braun <matze at braunis.de>
wrote:
> 
> Yes, I've run into the problem myself that the Pending queue isn't
even checked with the tryCandidate() logic and so takes priority over all other
scheduling decisions.
> 
> I personally would be open to changes in this area. To start the
brainstorming I could imagine that we move nodes below a target specific limit
into the available queue instead of just when they hit their latency cycle
limits. And then let tryCandidate() weight the remaining cycles against other
scheduling criteria.
> 
> Also keep in mind:
> - When making those changes be careful getting cycle bumping/simulation
logic right
> - The pending queue is also used as a mechanism to keep compile time (see
ReadyListLimit) in check (as checking every candidate for every instruction we
schedule is O(n**2)).
> 
> - Matthias
No, it doesn’t make any sense to schedule something from the pending queue if
there are still instructions that can be scheduled in the current cycle. Those
available instructions can effectively be scheduled “for free”. Scheduling them
first won’t delay anything in the pending queue!

If that’s not the case then there’s something wrong with your model. Or if this
kind of VLIW-style scheduling isn’t what you wanted, then you shouldn’t have
ordered it:

Set MicroOpBufferSize=1 instead:

  // MicroOpBufferSize is the number of micro-ops that the processor may buffer
  // for out-of-order execution.
  //
  // "0" means operations that are not ready in this cycle are not considered
  // for scheduling (they go in the pending queue). Latency is paramount. This
  // may be more efficient if many instructions are pending in a schedule.
  //
  // "1" means all instructions are considered for scheduling regardless of
  // whether they are ready in this cycle. Latency still causes issue stalls,
  // but we balance those stalls against other heuristics.
  //
  // "> 1" means the processor is out-of-order. This is a machine independent
  // estimate of highly machine specific characteristics such as the register
  // renaming pool and reorder buffer.
  unsigned MicroOpBufferSize;
  static const unsigned DefaultMicroOpBufferSize = 0;
>> On Oct 13, 2017, at 1:09 PM, Stefan Pintilie via llvm-dev <llvm-dev
at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>> wrote:
>> 
>> Hi, 
>> 
>> I've been looking at the Machine Scheduler on Power PC.  I am
looking only at the pre-RA machine scheduler and I am running it in the default
bi-directional mode (so, both top down and bottom up queues are considered).
I've come across an example where the scheduler picks a poor ordering for
the instructions which results in very high register pressure which results in
spills. The problem comes from the fact that the Machine Scheduler uses a
maximum latency limit when it considers instructions to schedule. A high latency
instruction will not be scheduled before all of the available lower latency
instructions are scheduled. This happens regardless of register pressure since
the higher latency instruction is not even added to the "Available"
queue that is used when the heuristics pick an instruction to schedule next.
>> 
>> My question is: Why do we have that latency limit in the first place?
If an instruction can be scheduled (ie all the instructions it depends on are
already scheduled) shouldn't it be at least considered?
>> 
>> 
>> 
>> The example is listed below:
>> 
>> test.c
>> --
>> long A[100];
>> long func(long* num, long* den) {
>> // This loop is unrolled
>> for (int i=0; i<6; i++) {
>>   A[i] = num[i] / den[i];
>> }
>> return 0;
>> }
>> --
>> 
>> Compile commands
>> --
>> clang -c -m64 -O3 -target powerpc64le-unknown-linux-gnu -mcpu=pwr9
-fexperimental-new-pass-manager test.c -S -emit-llvm
>> llc test.ll -O3 -ppc-asm-full-reg-names -debug-only=machine-scheduler
-o test-p9.s > listing.out 2>&1
>> --
>> 
>> Looking at the listing.out file I've noticed that all of the loads
are grouped together at the start of the function. Those loads use 12 registers
before any of the divides are scheduled. As a result, we end up with
significantly higher register pressure after all the loads.
>> --
>> 0B      BB#0: derived from LLVM BB %entry
>>             Live Ins: %X3 %X4
>> 16B             %vreg1<def> = COPY %X4; G8RC_and_G8RC_NOX0:%vreg1
>> 32B             %vreg0<def> = COPY %X3; G8RC_and_G8RC_NOX0:%vreg0
>> 48B             %vreg2<def> = LD 0, %vreg0;
mem:LD8[%num](tbaa=!4) G8RC:%vreg2 G8RC_and_G8RC_NOX0:%vreg0
>> 64B             %vreg3<def> = LD 0, %vreg1;
mem:LD8[%den](tbaa=!4) G8RC:%vreg3 G8RC_and_G8RC_NOX0:%vreg1
>> 144B            %vreg7<def> = LD 8, %vreg0;
mem:LD8[%arrayidx.1](tbaa=!4) G8RC:%vreg7 G8RC_and_G8RC_NOX0:%vreg0
>> 160B            %vreg8<def> = LD 8, %vreg1;
mem:LD8[%arrayidx2.1](tbaa=!4) G8RC:%vreg8 G8RC_and_G8RC_NOX0:%vreg1
>> 208B            %vreg10<def> = LD 16, %vreg0;
mem:LD8[%arrayidx.2](tbaa=!4) G8RC:%vreg10 G8RC_and_G8RC_NOX0:%vreg0
>> 224B            %vreg11<def> = LD 16, %vreg1;
mem:LD8[%arrayidx2.2](tbaa=!4) G8RC:%vreg11 G8RC_and_G8RC_NOX0:%vreg1
>> 272B            %vreg13<def> = LD 24, %vreg0;
mem:LD8[%arrayidx.3](tbaa=!4) G8RC:%vreg13 G8RC_and_G8RC_NOX0:%vreg0
>> 288B            %vreg14<def> = LD 24, %vreg1;
mem:LD8[%arrayidx2.3](tbaa=!4) G8RC:%vreg14 G8RC_and_G8RC_NOX0:%vreg1
>> 336B            %vreg16<def> = LD 32, %vreg0;
mem:LD8[%arrayidx.4](tbaa=!4) G8RC:%vreg16 G8RC_and_G8RC_NOX0:%vreg0
>> 352B            %vreg17<def> = LD 32, %vreg1;
mem:LD8[%arrayidx2.4](tbaa=!4) G8RC:%vreg17 G8RC_and_G8RC_NOX0:%vreg1
>> 400B            %vreg19<def> = LD 40, %vreg0;
mem:LD8[%arrayidx.5](tbaa=!4) G8RC:%vreg19 G8RC_and_G8RC_NOX0:%vreg0
>> 416B            %vreg20<def> = LD 40, %vreg1;
mem:LD8[%arrayidx2.5](tbaa=!4) G8RC:%vreg20 G8RC_and_G8RC_NOX0:%vreg1
>> 424B            %vreg4<def> = DIVD %vreg2, %vreg3;
G8RC:%vreg4,%vreg2,%vreg3
>> 432B            %vreg9<def> = DIVD %vreg7, %vreg8;
G8RC:%vreg9,%vreg7,%vreg8
>> 440B            %vreg12<def> = DIVD %vreg10, %vreg11;
G8RC:%vreg12,%vreg10,%vreg11
>> 448B            %vreg15<def> = DIVD %vreg13, %vreg14;
G8RC:%vreg15,%vreg13,%vreg14
>> 456B            %vreg18<def> = DIVD %vreg16, %vreg17;
G8RC:%vreg18,%vreg16,%vreg17
>> 464B            %vreg21<def> = DIVD %vreg19, %vreg20;
G8RC:%vreg21,%vreg19,%vreg20
>> 472B            %vreg5<def> = ADDIStocHA %X2, <ga:@A>;
G8RC_and_G8RC_NOX0:%vreg5
>> 480B            %vreg6<def> = LDtocL <ga:@A>, %vreg5,
%X2<imp-use>; mem:LD8[GOT] G8RC_and_G8RC_NOX0:%vreg6,%vreg5
>> 504B            %X3<def> = LI8 0
>> 512B            STD %vreg4, 0, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 0)](tbaa=!4) G8RC:%vreg4
G8RC_and_G8RC_NOX0:%vreg6
>> 520B            STD %vreg9, 8, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 1)](tbaa=!4) G8RC:%vreg9
G8RC_and_G8RC_NOX0:%vreg6
>> 528B            STD %vreg12, 16, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 2)](tbaa=!4) G8RC:%vreg12
G8RC_and_G8RC_NOX0:%vreg6
>> 536B            STD %vreg15, 24, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 3)](tbaa=!4) G8RC:%vreg15
G8RC_and_G8RC_NOX0:%vreg6
>> 544B            STD %vreg18, 32, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 4)](tbaa=!4) G8RC:%vreg18
G8RC_and_G8RC_NOX0:%vreg6
>> 552B            STD %vreg21, 40, %vreg6; mem:ST8[getelementptr inbounds
([100 x i64], [100 x i64]* @A, i64 0, i64 5)](tbaa=!4) G8RC:%vreg21
G8RC_and_G8RC_NOX0:%vreg6
>> 560B            BLR8 %LR8<imp-use>, %RM<imp-use>,
%X3<imp-use>
>> --
>> 
>> Due to all of the register pressure built up by those loads we are
forced to spill. Here is the final assembly.
>> --
>> # BB#0:                                 # %entry
>>         std r30, -16(r1)                # 8-byte Folded Spill
>>         ld r5, 0(r3)
>>         ld r6, 0(r4)
>>         ld r7, 8(r3)
>>         ld r8, 8(r4)
>>         ld r9, 16(r3)
>>         ld r10, 16(r4)
>>         ld r11, 24(r3)
>>         ld r0, 32(r3)
>>         ld r12, 24(r4)
>>         ld r30, 32(r4)
>>         ld r3, 40(r3)
>>         ld r4, 40(r4)
>>         divd r5, r5, r6
>>         divd r6, r7, r8
>>         divd r7, r9, r10
>>         divd r9, r0, r30
>>         divd r4, r3, r4
>>         divd r8, r11, r12
>>         addis r3, r2, .LC0 at toc@ha
>>         ld r30, -16(r1)                 # 8-byte Folded Reload
>>         ld r10, .LC0 at toc@l(r3)
>>         li r3, 0
>>         std r5, 0(r10)
>>         std r6, 8(r10)
>>         std r7, 16(r10)
>>         std r9, 32(r10)
>>         std r8, 24(r10)
>>         std r4, 40(r10)
>>         blr
>> --
>> 
>> Thank you, 
>> Stefan Pintilie
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
> 
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20171013/7fd93f19/attachment.html>

Maybe Matching Threads

Search for more seemingly similar threads

llvm dev - Oct 2017 - Machine Scheduler on Power PC: Latency Limit and Register Pressure

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

[llvm-dev] Machine Scheduler on Power PC: Latency Limit and Register Pressure

Maybe Matching Threads