Hello,
I was wondering if we had a way to strength reduce getelementptr today.
For the following code:
void func(float (&A)[132][140]) {> for (int i = 0; i < 132; ++i)
> for (int j = 0; j < 140; ++j)
> A[i][j] += 1;
> }
We will generate: (annotated with SCEV)
define void @func([132 x [140 x float]]* nocapture dereferenceable(73920))
local_unnamed_addr #0 {
br label %2
; <label>:2: ; preds = %11, %1
; loop<%2> at depth 1 with exact backedge-taken count of 131
; %12 = {1,+,1}<nuw><nsw><%2>
; %3 = {0,+,1}<nuw><nsw><%2>
%3 = phi i64 [ 0, %1 ], [ %12, %11 ]
br label %4
; <label>:4: ; preds = %4, %2
; loop<%4> at depth 2 with exact backedge-taken count of 139
; %5 = {0,+,1}<nuw><nsw><%4>
%5 = phi i64 [ 0, %2 ], [ %9, %4 ]
; %3 = {0,+,1}<nuw><nsw><%2>
; %6 = {{%0,+,560}<nsw><%2>,+,4}<nw><%4>
%6 = getelementptr inbounds [132 x [140 x float]], [132 x [140 x float]]*
%0, i64 0, *i64 %3, i64 %5*
%7 = load float, float* %6, align 4, !tbaa !2
%8 = fadd float %7, 1.000000e+00
store float %8, float* %6, align 4, !tbaa !2
; %9 = {1,+,1}<nuw><nsw><%4>
%9 = add nuw nsw i64 %5, 1
%10 = icmp eq i64 %9, 140
br i1 %10, label %11, label %4
; <label>:11: ; preds = %4
; loop<%2> at depth 1 with exact backedge-taken count of 131
; %3 = {0,+,1}<nuw><nsw><%2>
; %12 = {1,+,1}<nuw><nsw><%2>
%12 = add nuw nsw i64 %3, 1
%13 = icmp eq i64 %12, 132
br i1 %13, label %14, label %2
; <label>:14: ; preds = %11
ret void
}
And we see clearly that the gep is "complicated" in the sense that it
does
two addition and two multiplication.
Instead we could produce the following code:
define void @func([132 x [140 x float]]* nocapture dereferenceable(73920))
local_unnamed_addr #0 {
br label %2
; <label>:2: ; preds = %12, %1
; loop<%2> at depth 1 with exact backedge-taken count of 131
; %13 = {1,+,1}<nuw><nsw><%2>
; %3 = {0,+,1}<nuw><nsw><%2>
%3 = phi i64 [ 0, %1 ], [ %13, %12 ]
; %4 = {%0,+,560}<nsw><%2>
%4 = getelementptr inbounds [132 x [140 x float]], [132 x [140 x float]]*
%0, i64 0, *i64 %3*
br label %5
; <label>:5: ; preds = %5, %2
; loop<%5> at depth 2 with exact backedge-taken count of 139
; %6 = {0,+,1}<nuw><nsw><%5>
%6 = phi i64 [ 0, %2 ], [ %10, %5 ]
; %4 = {%0,+,560}<nsw><%2>
; %7 = {{%0,+,560}<nsw><%2>,+,4}<nsw><%5>
%7 = getelementptr inbounds [140 x float], [140 x float]* %4, i64 0, *i64
%6*
%8 = load float, float* %7, align 4, !tbaa !2
%9 = fadd float %8, 1.000000e+00
store float %9, float* %7, align 4, !tbaa !2
; %10 = {1,+,1}<nuw><nsw><%5>
%10 = add nuw nsw i64 %6, 1
%11 = icmp eq i64 %10, 140
br i1 %11, label %12, label %5
; <label>:12: ; preds = %5
; loop<%2> at depth 1 with exact backedge-taken count of 131
; %3 = {0,+,1}<nuw><nsw><%2>
; %13 = {1,+,1}<nuw><nsw><%2>
%13 = add nuw nsw i64 %3, 1
%14 = icmp eq i64 %13, 132
br i1 %14, label %15, label %2
; <label>:15: ; preds = %12
ret void
}
Where the gep has been strength reduced to one addition one multiplication.
Or even better (at least on the architecture I'm targeting):
define void @func([132 x [140 x float]]* nocapture dereferenceable(73920))
local_unnamed_addr #0 {
; %2 = %0
%2 = getelementptr inbounds [132 x [140 x float]], [132 x [140 x float]]*
%0, i64 0, i64 0
; %3 = (73920 + %0)<nsw>
%3 = getelementptr inbounds [132 x [140 x float]], [132 x [140 x float]]*
%0, i64 1, i64 0
br label %4
; <label>:4: ; preds = %14, %1
; loop<%4> at depth 1 with exact backedge-taken count of 131
; %2 = %0
; %15 = {(560 + %0)<nsw>,+,560}<nuw><%4>
; %5 = {%0,+,560}<nuw><%4>
%5 = phi [140 x float]* [ %2, %1 ], [ %15, %14 ]
; %6 = {%0,+,560}<nuw><%4>
%6 = getelementptr inbounds [140 x float], [140 x float]* %5, i64 0, i64 0
; %7 = {(560 + %0)<nsw>,+,560}<nuw><%4>
%7 = getelementptr inbounds [140 x float], [140 x float]* %5, i64 1, i64 0
br label %8
; <label>:8: ; preds = %8, %4
; loop<%8> at depth 2 with exact backedge-taken count of 139
; %6 = {%0,+,560}<nuw><%4>
; %9 = {{%0,+,560}<nuw><%4>,+,4}<nuw><%8>
%9 = phi float* [ %6, %4 ], [ %12, %8 ]
%10 = load float, float* %9, align 4, !tbaa !2
%11 = fadd float %10, 1.000000e+00
store float %11, float* %9, align 4, !tbaa !2
; %12 = {{(4 +
%0)<nsw>,+,560}<nw><%4>,+,4}<nuw><%8>
%12 = getelementptr inbounds float, float* %9, i64 1
; %7 = {(560 + %0)<nsw>,+,560}<nuw><%4>
%13 = icmp eq float* %12, %7
br i1 %13, label %14, label %8
; <label>:14: ; preds = %8
; loop<%4> at depth 1 with exact backedge-taken count of 131
; %5 = {%0,+,560}<nuw><%4>
; %15 = {(560 + %0)<nsw>,+,560}<nuw><%4>
%15 = getelementptr inbounds [140 x float], [140 x float]* %5, i64 1
; %3 = (73920 + %0)<nsw>
%16 = icmp eq [140 x float]* %15, %3
br i1 %16, label %17, label %4
; <label>:17: ; preds = %14
ret void
}
Where we only have addition by a constant (I also got rid of the induction
variables, but that's just being fancy). Do we have a way to do that
automatically?
Why don't we? (I'm guessing because that's negligible on X86)
--
*Alexandre Isoard*
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20180524/99e18f21/attachment.html>