On Wed, Sep 2, 2015 at 5:36 AM, James Molloy <james at jamesmolloy.co.uk> wrote:> Hi, > > Coremark really isn't a good enough test - have you run the LLVM test suite > with this patch, and what were the performance differences?For the test suite single source benches, the 235 tests improved performance, 2 regressed and 705 were unchanged. That seems very optimistic. Comparing consecutive runs with identical setting shows there is a lot of noise in the performance data. Tips for stable results would be appreciated.> I'm still a bit confused about what pattern exactly this pass is supposed to > trigger on. I understand the mechanics, but I still can't quite see what > patterns it would be useful on. You've mentioned matrix multiply - how does > this pass alter the IR?Here's before and after IR for the matrix_mul example. Notice the two bitcasts %1 and %2 generated in the for.cond.cleanup block. The L.E.V pass converts these to scevgep values that already exist. *** Code after LSR *** ; Function Attrs: nounwind optsize define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly %Src, i32 %Val) #0 { entry: %cmp.25 = icmp eq i32 %Size, 0 br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader for.body.4.lr.ph.preheader: ; preds = %entry %0 = shl i32 %Size, 2 br label %for.body.4.lr.ph for.body.4.lr.ph: ; preds %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ %2, %for.cond.cleanup.3 ] %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ %1, %for.cond.cleanup.3 ] %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, %for.body.4.lr.ph.preheader ] %lsr.iv56 = bitcast i32* %lsr.iv5 to i1* %lsr.iv12 = bitcast i32* %lsr.iv1 to i1* br label %for.body.4 for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3 br label %for.cond.cleanup for.cond.cleanup: ; preds %for.cond.cleanup.loopexit, %entry ret void for.cond.cleanup.3: ; preds = %for.body.4 %inc10 = add nuw nsw i32 %Outer.026, 1 %scevgep = getelementptr i1, i1* %lsr.iv12, i32 %0 %1 = bitcast i1* %scevgep to i32* %scevgep7 = getelementptr i1, i1* %lsr.iv56, i32 %0 %2 = bitcast i1* %scevgep7 to i32* %exitcond27 = icmp eq i32 %inc10, %Size br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph for.body.4: ; preds %for.body.4, %for.body.4.lr.ph %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, %for.body.4.lr.ph ] %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, %for.body.4.lr.ph ] %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ] %3 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 %mul5 = mul i32 %3, %Val store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 %lsr.iv.next = add i32 %lsr.iv, -1 %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 %exitcond = icmp eq i32 %lsr.iv.next, 0 br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 } *** Code after Loop Exit Values Optimization ** ; Function Attrs: nounwind optsize define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly %Src, i32 %Val) #0 { entry: %cmp.25 = icmp eq i32 %Size, 0 br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader for.body.4.lr.ph.preheader: ; preds = %entry br label %for.body.4.lr.ph for.body.4.lr.ph: ; preds %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ %scevgep9, %for.cond.cleanup.3 ] %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ %scevgep4, %for.cond.cleanup.3 ] %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, %for.body.4.lr.ph.preheader ] br label %for.body.4 for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3 br label %for.cond.cleanup for.cond.cleanup: ; preds %for.cond.cleanup.loopexit, %entry ret void for.cond.cleanup.3: ; preds = %for.body.4 %inc10 = add nuw nsw i32 %Outer.026, 1 %exitcond27 = icmp eq i32 %inc10, %Size br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph for.body.4: ; preds %for.body.4, %for.body.4.lr.ph %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, %for.body.4.lr.ph ] %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, %for.body.4.lr.ph ] %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ] %0 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 %mul5 = mul i32 %0, %Val store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 %lsr.iv.next = add i32 %lsr.iv, -1 %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 %exitcond = icmp eq i32 %lsr.iv.next, 0 br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4> What value is it avoiding being recomputed?I'm not precisely sure, but it's residue from LSR. The pass checks all computable SCEV values when a loop exits and in this case found GEPs with the same value.> How does this pass affect register pressure? > Also, your example just removes a mov and an add - the push/pops are just > register allocation (unless your pass in fact *reduces* register pressure?)Right, the computation eliminated is the mov and add. Register savings is a byproduct. Regards, -steve
Hello LLVM, It seems this thread has gone cold. Is there some low risk way for the community to take the new pass for a test drive? Regards, -steve On Wed, Sep 2, 2015 at 8:27 PM, Steve King <steve at metrokings.com> wrote:> On Wed, Sep 2, 2015 at 5:36 AM, James Molloy <james at jamesmolloy.co.uk> wrote: >> Hi, >> >> Coremark really isn't a good enough test - have you run the LLVM test suite >> with this patch, and what were the performance differences? > > For the test suite single source benches, the 235 tests improved > performance, 2 regressed and 705 were unchanged. That seems very > optimistic. Comparing consecutive runs with identical setting shows > there is a lot of noise in the performance data. Tips for stable > results would be appreciated. > > >> I'm still a bit confused about what pattern exactly this pass is supposed to >> trigger on. I understand the mechanics, but I still can't quite see what >> patterns it would be useful on. You've mentioned matrix multiply - how does >> this pass alter the IR? > > Here's before and after IR for the matrix_mul example. Notice the two > bitcasts %1 and %2 generated in the for.cond.cleanup block. The L.E.V > pass converts these to scevgep values that already exist. > > *** Code after LSR *** > > ; Function Attrs: nounwind optsize > define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture > readonly %Src, i32 %Val) #0 { > entry: > %cmp.25 = icmp eq i32 %Size, 0 > br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader > > for.body.4.lr.ph.preheader: ; preds = %entry > %0 = shl i32 %Size, 2 > br label %for.body.4.lr.ph > > for.body.4.lr.ph: ; preds > %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 > %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ %2, > %for.cond.cleanup.3 ] > %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ %1, > %for.cond.cleanup.3 ] > %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, > %for.body.4.lr.ph.preheader ] > %lsr.iv56 = bitcast i32* %lsr.iv5 to i1* > %lsr.iv12 = bitcast i32* %lsr.iv1 to i1* > br label %for.body.4 > > for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3 > br label %for.cond.cleanup > > for.cond.cleanup: ; preds > %for.cond.cleanup.loopexit, %entry > ret void > > for.cond.cleanup.3: ; preds = %for.body.4 > %inc10 = add nuw nsw i32 %Outer.026, 1 > %scevgep = getelementptr i1, i1* %lsr.iv12, i32 %0 > %1 = bitcast i1* %scevgep to i32* > %scevgep7 = getelementptr i1, i1* %lsr.iv56, i32 %0 > %2 = bitcast i1* %scevgep7 to i32* > %exitcond27 = icmp eq i32 %inc10, %Size > br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph > > for.body.4: ; preds > %for.body.4, %for.body.4.lr.ph > %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, > %for.body.4.lr.ph ] > %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, > %for.body.4.lr.ph ] > %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ] > %3 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 > %mul5 = mul i32 %3, %Val > store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 > %lsr.iv.next = add i32 %lsr.iv, -1 > %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 > %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 > %exitcond = icmp eq i32 %lsr.iv.next, 0 > br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 > } > > > *** Code after Loop Exit Values Optimization ** > > ; Function Attrs: nounwind optsize > define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture > readonly %Src, i32 %Val) #0 { > entry: > %cmp.25 = icmp eq i32 %Size, 0 > br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader > > for.body.4.lr.ph.preheader: ; preds = %entry > br label %for.body.4.lr.ph > > for.body.4.lr.ph: ; preds > %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 > %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ > %scevgep9, %for.cond.cleanup.3 ] > %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ > %scevgep4, %for.cond.cleanup.3 ] > %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, > %for.body.4.lr.ph.preheader ] > br label %for.body.4 > > for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.3 > br label %for.cond.cleanup > > for.cond.cleanup: ; preds > %for.cond.cleanup.loopexit, %entry > ret void > > for.cond.cleanup.3: ; preds = %for.body.4 > %inc10 = add nuw nsw i32 %Outer.026, 1 > %exitcond27 = icmp eq i32 %inc10, %Size > br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph > > for.body.4: ; preds > %for.body.4, %for.body.4.lr.ph > %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, > %for.body.4.lr.ph ] > %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, > %for.body.4.lr.ph ] > %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ] > %0 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 > %mul5 = mul i32 %0, %Val > store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 > %lsr.iv.next = add i32 %lsr.iv, -1 > %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 > %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 > %exitcond = icmp eq i32 %lsr.iv.next, 0 > br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 > > >> What value is it avoiding being recomputed? > I'm not precisely sure, but it's residue from LSR. The pass checks > all computable SCEV values when a loop exits and in this case found > GEPs with the same value. > >> How does this pass affect register pressure? >> Also, your example just removes a mov and an add - the push/pops are just >> register allocation (unless your pass in fact *reduces* register pressure?) > > Right, the computation eliminated is the mov and add. Register > savings is a byproduct. > > Regards, > -steve
Jake VanAdrighem via llvm-dev
2015-Sep-10 22:43 UTC
[llvm-dev] [RFC] New pass: LoopExitValues
Which cases does this pass handle which aren't otherwise optimized out by passes like GlobalValueNumbering or DeadCodeElimination? Thanks, Jake VanAdrighem On Thu, Sep 10, 2015 at 2:35 PM, Steve King <steve at metrokings.com> wrote:> Hello LLVM, > It seems this thread has gone cold. Is there some low risk way for > the community to take the new pass for a test drive? > Regards, > -steve > > On Wed, Sep 2, 2015 at 8:27 PM, Steve King <steve at metrokings.com> wrote: > > On Wed, Sep 2, 2015 at 5:36 AM, James Molloy <james at jamesmolloy.co.uk> > wrote: > >> Hi, > >> > >> Coremark really isn't a good enough test - have you run the LLVM test > suite > >> with this patch, and what were the performance differences? > > > > For the test suite single source benches, the 235 tests improved > > performance, 2 regressed and 705 were unchanged. That seems very > > optimistic. Comparing consecutive runs with identical setting shows > > there is a lot of noise in the performance data. Tips for stable > > results would be appreciated. > > > > > >> I'm still a bit confused about what pattern exactly this pass is > supposed to > >> trigger on. I understand the mechanics, but I still can't quite see what > >> patterns it would be useful on. You've mentioned matrix multiply - how > does > >> this pass alter the IR? > > > > Here's before and after IR for the matrix_mul example. Notice the two > > bitcasts %1 and %2 generated in the for.cond.cleanup block. The L.E.V > > pass converts these to scevgep values that already exist. > > > > *** Code after LSR *** > > > > ; Function Attrs: nounwind optsize > > define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture > > readonly %Src, i32 %Val) #0 { > > entry: > > %cmp.25 = icmp eq i32 %Size, 0 > > br i1 %cmp.25, label %for.cond.cleanup, label > %for.body.4.lr.ph.preheader > > > > for.body.4.lr.ph.preheader: ; preds = %entry > > %0 = shl i32 %Size, 2 > > br label %for.body.4.lr.ph > > > > for.body.4.lr.ph: ; preds > > %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 > > %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ %2, > > %for.cond.cleanup.3 ] > > %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ %1, > > %for.cond.cleanup.3 ] > > %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, > > %for.body.4.lr.ph.preheader ] > > %lsr.iv56 = bitcast i32* %lsr.iv5 to i1* > > %lsr.iv12 = bitcast i32* %lsr.iv1 to i1* > > br label %for.body.4 > > > > for.cond.cleanup.loopexit: ; preds > %for.cond.cleanup.3 > > br label %for.cond.cleanup > > > > for.cond.cleanup: ; preds > > %for.cond.cleanup.loopexit, %entry > > ret void > > > > for.cond.cleanup.3: ; preds = %for.body.4 > > %inc10 = add nuw nsw i32 %Outer.026, 1 > > %scevgep = getelementptr i1, i1* %lsr.iv12, i32 %0 > > %1 = bitcast i1* %scevgep to i32* > > %scevgep7 = getelementptr i1, i1* %lsr.iv56, i32 %0 > > %2 = bitcast i1* %scevgep7 to i32* > > %exitcond27 = icmp eq i32 %inc10, %Size > > br i1 %exitcond27, label %for.cond.cleanup.loopexit, label % > for.body.4.lr.ph > > > > for.body.4: ; preds > > %for.body.4, %for.body.4.lr.ph > > %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, > > %for.body.4.lr.ph ] > > %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, > > %for.body.4.lr.ph ] > > %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, % > for.body.4.lr.ph ] > > %3 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 > > %mul5 = mul i32 %3, %Val > > store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 > > %lsr.iv.next = add i32 %lsr.iv, -1 > > %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 > > %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 > > %exitcond = icmp eq i32 %lsr.iv.next, 0 > > br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 > > } > > > > > > *** Code after Loop Exit Values Optimization ** > > > > ; Function Attrs: nounwind optsize > > define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture > > readonly %Src, i32 %Val) #0 { > > entry: > > %cmp.25 = icmp eq i32 %Size, 0 > > br i1 %cmp.25, label %for.cond.cleanup, label > %for.body.4.lr.ph.preheader > > > > for.body.4.lr.ph.preheader: ; preds = %entry > > br label %for.body.4.lr.ph > > > > for.body.4.lr.ph: ; preds > > %for.body.4.lr.ph.preheader, %for.cond.cleanup.3 > > %lsr.iv5 = phi i32* [ %Src, %for.body.4.lr.ph.preheader ], [ > > %scevgep9, %for.cond.cleanup.3 ] > > %lsr.iv1 = phi i32* [ %Dst, %for.body.4.lr.ph.preheader ], [ > > %scevgep4, %for.cond.cleanup.3 ] > > %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, > > %for.body.4.lr.ph.preheader ] > > br label %for.body.4 > > > > for.cond.cleanup.loopexit: ; preds > %for.cond.cleanup.3 > > br label %for.cond.cleanup > > > > for.cond.cleanup: ; preds > > %for.cond.cleanup.loopexit, %entry > > ret void > > > > for.cond.cleanup.3: ; preds = %for.body.4 > > %inc10 = add nuw nsw i32 %Outer.026, 1 > > %exitcond27 = icmp eq i32 %inc10, %Size > > br i1 %exitcond27, label %for.cond.cleanup.loopexit, label % > for.body.4.lr.ph > > > > for.body.4: ; preds > > %for.body.4, %for.body.4.lr.ph > > %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %lsr.iv5, > > %for.body.4.lr.ph ] > > %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %lsr.iv1, > > %for.body.4.lr.ph ] > > %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, % > for.body.4.lr.ph ] > > %0 = load i32, i32* %lsr.iv8, align 4, !tbaa !1 > > %mul5 = mul i32 %0, %Val > > store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !1 > > %lsr.iv.next = add i32 %lsr.iv, -1 > > %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1 > > %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1 > > %exitcond = icmp eq i32 %lsr.iv.next, 0 > > br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 > > > > > >> What value is it avoiding being recomputed? > > I'm not precisely sure, but it's residue from LSR. The pass checks > > all computable SCEV values when a loop exits and in this case found > > GEPs with the same value. > > > >> How does this pass affect register pressure? > >> Also, your example just removes a mov and an add - the push/pops are > just > >> register allocation (unless your pass in fact *reduces* register > pressure?) > > > > Right, the computation eliminated is the mov and add. Register > > savings is a byproduct. > > > > Regards, > > -steve >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150910/7e781ecb/attachment.html>