Hi Steve,
Do you primarily find this to help for nested loops? If so, that
could be because LSR explicitly bails out of processing them:
// Skip nested loops until we can model them better with formulae.
if (!L->empty()) {
DEBUG(dbgs() << "LSR skipping outer loop " << *L
<< "\n");
return;
}
I don't know how much time you're willing to commit to this, but
perhaps a more principled fix is to change LSR to actually work with
nested loops?
If I comment out this change, after LSR the matrix_mul routine does
not actually look any better (possibly even worse):
; matrix_mul, quoted here as post-LSR IR.
; Net effect of the two loops below: Size * Size consecutive i32 elements are
; loaded from %Src, multiplied by %Val, and stored at the same index in %Dst.
; NOTE(review): several long lines (the define line, some phis, and the
; "; preds =" comments) appear to have been hard-wrapped in transit; the line
; following a wrapped "; preds =" is the rest of that comment, not an
; instruction.
define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly
%Src, i32 %Val) {
entry:
; i8* views of both buffers so the outer loop can address them by byte offset.
%Src12 = bitcast i32* %Src to i8*
%Dst14 = bitcast i32* %Dst to i8*
; Size == 0: skip both loops entirely.
%cmp.25 = icmp eq i32 %Size, 0
br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader
for.body.4.lr.ph.preheader: ; preds = %entry
; %0 = Size * 4, the amount added to the outer byte offset after each pass of
; the inner loop (i.e. Size i32 elements).
%0 = shl i32 %Size, 2
br label %for.body.4.lr.ph
; Outer loop header. %lsr.iv17 is a down-counter starting at Size; %lsr.iv10 is
; the current byte offset into both buffers, starting at 0.
for.body.4.lr.ph: ; preds =
%for.body.4.lr.ph.preheader, %for.cond.cleanup.3
%lsr.iv17 = phi i32 [ %Size, %for.body.4.lr.ph.preheader ], [ %lsr.iv.next18,
%for.cond.cleanup.3 ]
%lsr.iv10 = phi i32 [ 0, %for.body.4.lr.ph.preheader ], [ %lsr.iv.next11,
%for.cond.cleanup.3 ]
; Starting pointers for this outer iteration: Src + offset and Dst + offset,
; computed as byte GEPs and cast back to i32*.
%uglygep = getelementptr i8, i8* %Src12, i32 %lsr.iv10
%uglygep13 = bitcast i8* %uglygep to i32*
%uglygep15 = getelementptr i8, i8* %Dst14, i32 %lsr.iv10
%uglygep1516 = bitcast i8* %uglygep15 to i32*
br label %for.body.4
; Inner loop. Three induction variables: a source pointer (%lsr.iv8), a
; destination pointer (%lsr.iv3), and a down-counter (%lsr.iv) starting at Size.
for.body.4: ; preds = %for.body.4,
%for.body.4.lr.ph
%lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %uglygep13,
%for.body.4.lr.ph ]
%lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %uglygep1516,
%for.body.4.lr.ph ]
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ]
; *dst = *src * Val
%1 = load i32, i32* %lsr.iv8, align 4, !tbaa !0
%mul5 = mul i32 %1, %Val
store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !0
; Step the counter down by one and both pointers forward by one i32; leave the
; inner loop when the counter reaches 0.
%lsr.iv.next = add i32 %lsr.iv, -1
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1
%scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1
%exitcond = icmp eq i32 %lsr.iv.next, 0
br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4
; Outer loop latch: advance the byte offset by %0, decrement the outer counter,
; and repeat until it reaches 0.
for.cond.cleanup.3: ; preds = %for.body.4
%lsr.iv.next11 = add i32 %lsr.iv10, %0
%lsr.iv.next18 = add i32 %lsr.iv17, -1
%exitcond27 = icmp eq i32 %lsr.iv.next18, 0
br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph
for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3
br label %for.cond.cleanup
; Common exit, reached from the loop exit or directly from entry when Size == 0.
for.cond.cleanup: ; preds =
%for.cond.cleanup.loopexit, %entry
ret void
}
-- Sanjoy