On Wed, Sep 2, 2015 at 5:36 AM, James Molloy <james at jamesmolloy.co.uk>
wrote:> Hi,
>
> Coremark really isn't a good enough test - have you run the LLVM test
suite
> with this patch, and what were the performance differences?
For the test suite single source benches, the 235 tests improved
performance, 2 regressed and 705 were unchanged. That seems very
optimistic. Comparing consecutive runs with identical setting shows
there is a lot of noise in the performance data. Tips for stable
results would be appreciated.
> I'm still a bit confused about what pattern exactly this pass is
supposed to
> trigger on. I understand the mechanics, but I still can't quite see
what
> patterns it would be useful on. You've mentioned matrix multiply - how
does
> this pass alter the IR?
Here's before and after IR for the matrix_mul example. Notice the two
bitcasts %1 and %2 generated in the for.cond.cleanup block. The L.E.V
pass converts these to scevgep values that already exist.
*** Code after LSR ***
; Function Attrs: nounwind optsize
define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture
readonly %Src, i32 %Val) #0 {
entry:
%cmp.25 = icmp eq i32 %Size, 0
br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader
for.body.4.lr.ph.preheader: ; preds = %entry
%0 = shl i32 %Size, 2
br label %for.body.4.lr.ph
for.body.4.lr.ph: ; preds
%for.body.4.lr.ph.preheader, %for.cond.cleanup.3
%lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ %2,
%for.cond.cleanup.3 ]
%lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ %1,
%for.cond.cleanup.3 ]
%Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0,
%for.body.4.lr.ph.preheader ]
%lsr.iv56 = bitcast i32* %lsr.iv5 to i1*
%lsr.iv12 = bitcast i32* %lsr.iv1 to i1*
br label %for.body.4
for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3
br label %for.cond.cleanup
for.cond.cleanup: ; preds
%for.cond.cleanup.loopexit, %entry
ret void
for.cond.cleanup.3: ; preds = %for.body.4
%inc10 = add nuw nsw i32 %Outer.026, 1
%scevgep = getelementptr i1, i1* %lsr.iv12, i32 %0
%1 = bitcast i1* %scevgep to i32*
%scevgep7 = getelementptr i1, i1* %lsr.iv56, i32 %0
%2 = bitcast i1* %scevgep7 to i32*
%exitcond27 = icmp eq i32 %inc10, %Size
br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph
for.body.4: ; preds %for.body.4,
%for.body.4.lr.ph
%lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5,
%for.body.4.lr.ph ]
%lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1,
%for.body.4.lr.ph ]
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ]
%3 = load i32, i32* %lsr.iv8, align 4, !tbaa !1
%mul5 = mul i32 %3, %Val
store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1
%lsr.iv.next = add i32 %lsr.iv, -1
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1
%scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1
%exitcond = icmp eq i32 %lsr.iv.next, 0
br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4
}
*** Code after Loop Exit Values Optimization **
; Function Attrs: nounwind optsize
define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture
readonly %Src, i32 %Val) #0 {
entry:
%cmp.25 = icmp eq i32 %Size, 0
br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader
for.body.4.lr.ph.preheader: ; preds = %entry
br label %for.body.4.lr.ph
for.body.4.lr.ph: ; preds
%for.body.4.lr.ph.preheader, %for.cond.cleanup.3
%lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [
%scevgep9, %for.cond.cleanup.3 ]
%lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [
%scevgep4, %for.cond.cleanup.3 ]
%Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0,
%for.body.4.lr.ph.preheader ]
br label %for.body.4
for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3
br label %for.cond.cleanup
for.cond.cleanup: ; preds
%for.cond.cleanup.loopexit, %entry
ret void
for.cond.cleanup.3: ; preds = %for.body.4
%inc10 = add nuw nsw i32 %Outer.026, 1
%exitcond27 = icmp eq i32 %inc10, %Size
br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph
for.body.4: ; preds %for.body.4,
%for.body.4.lr.ph
%lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5,
%for.body.4.lr.ph ]
%lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1,
%for.body.4.lr.ph ]
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ]
%0 = load i32, i32* %lsr.iv8, align 4, !tbaa !1
%mul5 = mul i32 %0, %Val
store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1
%lsr.iv.next = add i32 %lsr.iv, -1
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1
%scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1
%exitcond = icmp eq i32 %lsr.iv.next, 0
br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4
> What value is it avoiding being recomputed?
I'm not precisely sure, but it's residue from LSR. The pass checks
all computable SCEV values when a loop exits and in this case found
GEPs with the same value.
> How does this pass affect register pressure?
> Also, your example just removes a mov and an add - the push/pops are just
> register allocation (unless your pass in fact *reduces* register pressure?)
Right, the computation eliminated is the mov and add. Register
savings is a byproduct.
Regards,
-steve