Jatin Bhateja via llvm-dev
2017-Oct-26 16:54 UTC
[llvm-dev] LLVM 6.0's LoopUnroll PASS is not able to work?
Hi Leslie, There is a disable unroll meta data (!llvm.loop !2) associated with unCatN loop basic block , probaly in the source pragma clang loop unroll (disable) was used before the loop. I tried removing that and used -unroll-count=4 both the catN and uncatN were unrolled. Options : -mem2reg -loops -loop-simplify -loop-rotate -lcssa -loop-unroll -unroll-count=4 -sccp -simplifycfg -o /tmp/1 -debug-only=loop-unroll. Loop Unroll: F[catN] Loop %for.body Loop Size = 17 UNROLLING loop %for.body by 4! Loop Unroll: F[unCatN] Loop %for.body Loop Size = 9 UNROLLING loop %for.body by 4 with run-time trip count! Thanks, Jatin On Wed, Oct 25, 2017 at 10:17 AM, Leslie Zhai via llvm-dev < llvm-dev at lists.llvm.org> wrote:> Hi Michael, > > Dropped *optnone* still failed to unroll loops for this testcase: > > $ cat cat_state.n044.ll > ; ModuleID = 'cat_state.n045a.ll' > source_filename = "cat_state.n04_merged.scaffold" > target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: noinline nounwind uwtable > define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %cmp1 = icmp sgt i32 %n, 1 > br i1 %cmp1, label %for.body.lr.ph, label %for.end > > for.body.lr.ph: ; preds = %entry > %1 = add i32 %n, -1 > %2 = add i32 %n, -2 > %xtraiter = and i32 %1, 1 > %3 = icmp ult i32 %2, 1 > br i1 %3, label %for.cond.for.end_crit_edge.unr-lcssa, label > %for.body.lr.ph.new > > for.body.lr.ph.new: ; preds = % > for.body.lr.ph > %unroll_iter = sub i32 %1, %xtraiter > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.new > %inc3 = phi i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ] > %niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ %niter.nsub.1, > %for.body ] > %idxprom = sext i32 %inc3 to i64 > %arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %4 = load i16, i16* %arrayidx1, align 2 > %sub = 
add nsw i32 %inc3, -1 > %idxprom2 = sext i32 %sub to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %5 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5) > %inc = add nsw i32 %inc3, 1 > %niter.nsub = sub i32 %niter, 1 > %idxprom.1 = sext i32 %inc to i64 > %arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1 > %6 = load i16, i16* %arrayidx1.1, align 2 > %idxprom2.1 = sext i32 %inc3 to i64 > %arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.1 > %7 = load i16, i16* %arrayidx3.1, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7) > %inc.1 = add nsw i32 %inc, 1 > %niter.nsub.1 = sub i32 %niter.nsub, 1 > %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0 > br i1 %niter.ncmp.1, label %for.body, label > %for.cond.for.end_crit_edge.unr-lcssa > > for.cond.for.end_crit_edge.unr-lcssa: ; preds = %for.body, % > for.body.lr.ph > %inc3.unr = phi i32 [ 1, %for.body.lr.ph ], [ %inc.1, %for.body ] > %lcmp.mod = icmp ne i32 %xtraiter, 0 > br i1 %lcmp.mod, label %for.body.epil, label %for.end > > for.body.epil: ; preds > %for.cond.for.end_crit_edge.unr-lcssa > %inc3.epil = phi i32 [ %inc3.unr, %for.cond.for.end_crit_edge.unr-lcssa > ] > %idxprom.epil = sext i32 %inc3.epil to i64 > %arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.epil > %8 = load i16, i16* %arrayidx1.epil, align 2 > %sub.epil = add nsw i32 %inc3.epil, -1 > %idxprom2.epil = sext i32 %sub.epil to i64 > %arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.epil > %9 = load i16, i16* %arrayidx3.epil, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %8, i16 %9) > %inc.epil = add nsw i32 %inc3.epil, 1 > %cmp.epil = icmp slt i32 %inc.epil, %n > br label %for.end > > for.end: ; preds > %for.body.epil, %for.cond.for.end_crit_edge.unr-lcssa, %entry > ret void > } > > ; Function Attrs: nounwind > declare void @llvm.H.i16(i16) #1 > > ; Function Attrs: nounwind > declare void 
@llvm.CNOT.i16.i16(i16, i16) #1 > > ; Function Attrs: noinline nounwind uwtable > define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %storemerge1 = add nsw i32 %n, -1 > %cmp2 = icmp sgt i32 %n, 1 > br i1 %cmp2, label %for.body.peel, label %for.end > > for.body.peel: ; preds = %entry > %idxprom.peel = sext i32 %storemerge1 to i64 > %arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom.peel > %0 = load i16, i16* %arrayidx.peel, align 2 > %sub1.peel = add nsw i32 %n, -2 > %idxprom2.peel = sext i32 %sub1.peel to i64 > %arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.peel > %1 = load i16, i16* %arrayidx3.peel, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1) > %storemerge.peel = add nsw i32 %storemerge1, -1 > %cmp.peel = icmp sgt i32 %storemerge1, 1 > br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end > > for.body.lr.ph.peel.newph: ; preds = %for.body.peel > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.peel.newph > %storemerge5 = phi i32 [ %storemerge.peel, %for.body.lr.ph.peel.newph ], > [ %storemerge, %for.body ] > %storemerge.in3 = phi i32 [ %storemerge1, %for.body.lr.ph.peel.newph ], > [ %storemerge5, %for.body ] > %idxprom = sext i32 %storemerge5 to i64 > %arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %2 = load i16, i16* %arrayidx, align 2 > %sub1 = add nsw i32 %storemerge.in3, -2 > %idxprom2 = sext i32 %sub1 to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %3 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3) > %storemerge = add nsw i32 %storemerge5, -1 > %cmp = icmp sgt i32 %storemerge5, 1 > br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2 > > for.end: ; preds = %for.body, > %for.body.peel, %entry > %.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ %bit, > %for.body ] > %4 = load i16, i16* %.lcssa, align 2 > tail call void @llvm.H.i16(i16 %4) 
> ret void > } > > ; Function Attrs: noinline nounwind uwtable > define i32 @main() local_unnamed_addr #0 { > entry: > %bits = alloca [4 x i16], align 2 > %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %bits, i64 0, > i64 0 > call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %arraydecay, i32 > undef) > ret i32 0 > } > > define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) { > entry.: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %1 = load i16, i16* %arrayidx1., align 2 > %2 = load i16, i16* %bit, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2) > %arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %3 = load i16, i16* %arrayidx1.1., align 2 > %arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %4 = load i16, i16* %arrayidx3.1., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4) > %arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3 > <https://maps.google.com/?q=i64+3&entry=gmail&source=g> > %5 = load i16, i16* %arrayidx1.epil., align 2 > %arrayidx3.epil. 
= getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %6 = load i16, i16* %arrayidx3.epil., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6) > ret void > } > > attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > attributes #1 = { nounwind } > > !llvm.module.flags = !{!0} > !llvm.ident = !{!1} > > !0 = !{i32 1, !"wchar_size", i32 4} > !1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git > 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) (git at github.com:llvm-mirror/llvm.git > d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"} > !2 = distinct !{!2, !3} > !3 = !{!"llvm.loop.unroll.disable"} > > $(OPT) -S cat_state.n044.ll -mem2reg -loops -loop-simplify -loop-rotate > -lcssa -loop-unroll -unroll-threshold=100000000 -sccp -simplifycfg -o > cat_state.n045.ll > > $ cat cat_state.n045.ll > ; ModuleID = 'cat_state.n044.ll' > source_filename = "cat_state.n04_merged.scaffold" > target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: noinline nounwind uwtable > define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %cmp1 = icmp sgt i32 %n, 1 > br i1 %cmp1, label %for.body.lr.ph, label %for.end > > for.body.lr.ph: ; preds = %entry > %1 = add i32 %n, -1 > %2 = add i32 %n, -2 > %xtraiter = and i32 %1, 1 > %3 = icmp ult i32 %2, 1 > br i1 %3, label %for.cond.for.end_crit_edge.unr-lcssa, label > 
%for.body.lr.ph.new > > for.body.lr.ph.new: ; preds = % > for.body.lr.ph > %unroll_iter = sub i32 %1, %xtraiter > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.new > %inc3 = phi i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ] > %niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ %niter.nsub.1, > %for.body ] > %idxprom = sext i32 %inc3 to i64 > %arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %4 = load i16, i16* %arrayidx1, align 2 > %sub = add nsw i32 %inc3, -1 > %idxprom2 = sext i32 %sub to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %5 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5) > %inc = add nsw i32 %inc3, 1 > %niter.nsub = sub i32 %niter, 1 > %idxprom.1 = sext i32 %inc to i64 > %arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1 > %6 = load i16, i16* %arrayidx1.1, align 2 > %idxprom2.1 = sext i32 %inc3 to i64 > %arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.1 > %7 = load i16, i16* %arrayidx3.1, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7) > %inc.1 = add nsw i32 %inc, 1 > %niter.nsub.1 = sub i32 %niter.nsub, 1 > %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0 > br i1 %niter.ncmp.1, label %for.body, label > %for.cond.for.end_crit_edge.unr-lcssa > > for.cond.for.end_crit_edge.unr-lcssa: ; preds = %for.body, % > for.body.lr.ph > %inc3.unr = phi i32 [ 1, %for.body.lr.ph ], [ %inc.1, %for.body ] > %lcmp.mod = icmp ne i32 %xtraiter, 0 > br i1 %lcmp.mod, label %for.body.epil, label %for.end > > for.body.epil: ; preds > %for.cond.for.end_crit_edge.unr-lcssa > %inc3.epil = phi i32 [ %inc3.unr, %for.cond.for.end_crit_edge.unr-lcssa > ] > %idxprom.epil = sext i32 %inc3.epil to i64 > %arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.epil > %8 = load i16, i16* %arrayidx1.epil, align 2 > %sub.epil = add nsw i32 %inc3.epil, -1 > %idxprom2.epil = sext i32 %sub.epil to i64 > 
%arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.epil > %9 = load i16, i16* %arrayidx3.epil, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %8, i16 %9) > %inc.epil = add nsw i32 %inc3.epil, 1 > %cmp.epil = icmp slt i32 %inc.epil, %n > br label %for.end > > for.end: ; preds > %for.body.epil, %for.cond.for.end_crit_edge.unr-lcssa, %entry > ret void > } > > ; Function Attrs: nounwind > declare void @llvm.H.i16(i16) #1 > > ; Function Attrs: nounwind > declare void @llvm.CNOT.i16.i16(i16, i16) #1 > > ; Function Attrs: noinline nounwind uwtable > define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %storemerge1 = add nsw i32 %n, -1 > %cmp2 = icmp sgt i32 %n, 1 > br i1 %cmp2, label %for.body.peel, label %for.end > > for.body.peel: ; preds = %entry > %idxprom.peel = sext i32 %storemerge1 to i64 > %arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom.peel > %0 = load i16, i16* %arrayidx.peel, align 2 > %sub1.peel = add nsw i32 %n, -2 > %idxprom2.peel = sext i32 %sub1.peel to i64 > %arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.peel > %1 = load i16, i16* %arrayidx3.peel, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1) > %storemerge.peel = add nsw i32 %storemerge1, -1 > %cmp.peel = icmp sgt i32 %storemerge1, 1 > br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end > > for.body.lr.ph.peel.newph: ; preds = %for.body.peel > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.peel.newph > %storemerge5 = phi i32 [ %storemerge.peel, %for.body.lr.ph.peel.newph ], > [ %storemerge, %for.body ] > %storemerge.in3 = phi i32 [ %storemerge1, %for.body.lr.ph.peel.newph ], > [ %storemerge5, %for.body ] > %idxprom = sext i32 %storemerge5 to i64 > %arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %2 = load i16, i16* %arrayidx, align 2 > %sub1 = add nsw i32 %storemerge.in3, -2 > %idxprom2 = sext i32 %sub1 to i64 > %arrayidx3 = getelementptr inbounds 
i16, i16* %bit, i64 %idxprom2 > %3 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3) > %storemerge = add nsw i32 %storemerge5, -1 > %cmp = icmp sgt i32 %storemerge5, 1 > br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2 > > for.end: ; preds = %for.body, > %for.body.peel, %entry > %.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ %bit, > %for.body ] > %4 = load i16, i16* %.lcssa, align 2 > tail call void @llvm.H.i16(i16 %4) > ret void > } > > ; Function Attrs: noinline nounwind uwtable > define i32 @main() local_unnamed_addr #0 { > entry: > %bits = alloca [4 x i16], align 2 > %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %bits, i64 0, > i64 0 > call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %arraydecay, i32 > undef) > ret i32 0 > } > > define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) { > entry.: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %1 = load i16, i16* %arrayidx1., align 2 > %2 = load i16, i16* %bit, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2) > %arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %3 = load i16, i16* %arrayidx1.1., align 2 > %arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %4 = load i16, i16* %arrayidx3.1., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4) > %arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3 > <https://maps.google.com/?q=i64+3&entry=gmail&source=g> > %5 = load i16, i16* %arrayidx1.epil., align 2 > %arrayidx3.epil. 
= getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %6 = load i16, i16* %arrayidx3.epil., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6) > ret void > } > > attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > attributes #1 = { nounwind } > > !llvm.module.flags = !{!0} > !llvm.ident = !{!1} > > !0 = !{i32 1, !"wchar_size", i32 4} > !1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git > 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) (git at github.com:llvm-mirror/llvm.git > d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"} > !2 = distinct !{!2, !3} > !3 = !{!"llvm.loop.unroll.disable"} > > > There are still for *loops* in catN and unCatN Functions, workaround might > be using GlobalDCE PASS towards cat_state.n045.ll to remove !Live > DeadFunctions. 
> > Cat_State testcase: https://github.com/ScaffCC/ScaffCC/blob/master/Algorithms/Cat_State/cat_state.n04.scaffold > > Scaffold builtin gates: https://github.com/ScaffCC/scaff-clang/blob/master/include/clang/Basic/Builtins.def#L108 > > Ali JavadiAbhari, Shruti Patil, Daniel Kudrow, Jeff Heckey, Alexey Lvov, > Frederic Chong and Margaret Martonosi, ScaffCC: A Framework for Compilation > and Analysis of Quantum Computing Programs, ACM International Conference on > Computing Frontiers (CF 2014), Cagliari, Italy, May 2014 > > > 在 2017年10月24日 12:52, Michael Kruse 写道: > >> 2017-10-24 6:19 GMT+02:00 Leslie Zhai via llvm-dev < >> llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>>: >> > attributes #0 = { noinline nounwind *optnone *uwtable >> >> The optnone attribute (added by clang in -O0) says LLVM to not apply any >> transformation. Avoid with -Xclang -disable-O0-optnone >> >> Michael >> >> > -- > Regards, > Leslie Zhai - https://reviews.llvm.org/p/xiangzhai/ > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171026/30cfa42a/attachment-0001.html>
Leslie Zhai via llvm-dev
2017-Nov-01 03:45 UTC
[llvm-dev] LLVM 6.0's LoopUnroll PASS is not able to work?
Hi Jatin, Wrongly use -loop-simplify produce this issue https://github.com/epiqc/ScaffCC/issues/11 在 2017年10月27日 00:54, Jatin Bhateja 写道:> Hi Leslie, > > There is a disable unroll meta data (!llvm.loop !2) associated with > unCatN loop basic block , probaly in the source pragma clang loop > unroll (disable) was used before the loop. > > I tried removing that and used -unroll-count=4 both the catN and > uncatN were unrolled. > > Options : -mem2reg -loops -loop-simplify -loop-rotate -lcssa > -loop-unroll -unroll-count=4 -sccp -simplifycfg -o /tmp/1 > -debug-only=loop-unroll. > > Loop Unroll: F[catN] Loop %for.body > Loop Size = 17 > UNROLLING loop %for.body by 4! > Loop Unroll: F[unCatN] Loop %for.body > Loop Size = 9 > UNROLLING loop %for.body by 4 with run-time trip count! > > Thanks, > Jatin > > On Wed, Oct 25, 2017 at 10:17 AM, Leslie Zhai via llvm-dev > <llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>> wrote: > > Hi Michael, > > Dropped *optnone* still failed to unroll loops for this testcase: > > $ cat cat_state.n044.ll > ; ModuleID = 'cat_state.n045a.ll' > source_filename = "cat_state.n04_merged.scaffold" > target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: noinline nounwind uwtable > define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %cmp1 = icmp sgt i32 %n, 1 > br i1 %cmp1, label %for.body.lr.ph <http://for.body.lr.ph>, > label %for.end > > for.body.lr.ph <http://for.body.lr.ph>: ; preds = %entry > %1 = add i32 %n, -1 > %2 = add i32 %n, -2 > %xtraiter = and i32 %1, 1 > %3 = icmp ult i32 %2, 1 > br i1 %3, label %for.cond.for.end_crit_edge.unr-lcssa, label > %for.body.lr.ph.new > > for.body.lr.ph.new: ; preds = %for.body.lr.ph > <http://for.body.lr.ph> > %unroll_iter = sub i32 %1, %xtraiter > br label %for.body > > for.body: ; preds = %for.body, %for.body.lr.ph.new > %inc3 = phi 
i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ] > %niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ > %niter.nsub.1, %for.body ] > %idxprom = sext i32 %inc3 to i64 > %arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %4 = load i16, i16* %arrayidx1, align 2 > %sub = add nsw i32 %inc3, -1 > %idxprom2 = sext i32 %sub to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %5 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5) > %inc = add nsw i32 %inc3, 1 > %niter.nsub = sub i32 %niter, 1 > %idxprom.1 = sext i32 %inc to i64 > %arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1 > %6 = load i16, i16* %arrayidx1.1, align 2 > %idxprom2.1 = sext i32 %inc3 to i64 > %arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.1 > %7 = load i16, i16* %arrayidx3.1, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7) > %inc.1 = add nsw i32 %inc, 1 > %niter.nsub.1 = sub i32 %niter.nsub, 1 > %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0 > br i1 %niter.ncmp.1, label %for.body, label > %for.cond.for.end_crit_edge.unr-lcssa > > for.cond.for.end_crit_edge.unr-lcssa: ; preds > %for.body, %for.body.lr.ph <http://for.body.lr.ph> > %inc3.unr = phi i32 [ 1, %for.body.lr.ph <http://for.body.lr.ph> > ], [ %inc.1, %for.body ] > %lcmp.mod = icmp ne i32 %xtraiter, 0 > br i1 %lcmp.mod, label %for.body.epil, label %for.end > > for.body.epil: ; preds > %for.cond.for.end_crit_edge.unr-lcssa > %inc3.epil = phi i32 [ %inc3.unr, > %for.cond.for.end_crit_edge.unr-lcssa ] > %idxprom.epil = sext i32 %inc3.epil to i64 > %arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.epil > %8 = load i16, i16* %arrayidx1.epil, align 2 > %sub.epil = add nsw i32 %inc3.epil, -1 > %idxprom2.epil = sext i32 %sub.epil to i64 > %arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.epil > %9 = load i16, i16* %arrayidx3.epil, align 2 > tail call void 
@llvm.CNOT.i16.i16(i16 %8, i16 %9) > %inc.epil = add nsw i32 %inc3.epil, 1 > %cmp.epil = icmp slt i32 %inc.epil, %n > br label %for.end > > for.end: ; preds = %for.body.epil, > %for.cond.for.end_crit_edge.unr-lcssa, %entry > ret void > } > > ; Function Attrs: nounwind > declare void @llvm.H.i16(i16) #1 > > ; Function Attrs: nounwind > declare void @llvm.CNOT.i16.i16(i16, i16) #1 > > ; Function Attrs: noinline nounwind uwtable > define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %storemerge1 = add nsw i32 %n, -1 > %cmp2 = icmp sgt i32 %n, 1 > br i1 %cmp2, label %for.body.peel, label %for.end > > for.body.peel: ; preds = %entry > %idxprom.peel = sext i32 %storemerge1 to i64 > %arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.peel > %0 = load i16, i16* %arrayidx.peel, align 2 > %sub1.peel = add nsw i32 %n, -2 > %idxprom2.peel = sext i32 %sub1.peel to i64 > %arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.peel > %1 = load i16, i16* %arrayidx3.peel, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1) > %storemerge.peel = add nsw i32 %storemerge1, -1 > %cmp.peel = icmp sgt i32 %storemerge1, 1 > br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end > > for.body.lr.ph.peel.newph: ; preds > %for.body.peel > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.peel.newph > %storemerge5 = phi i32 [ %storemerge.peel, > %for.body.lr.ph.peel.newph ], [ %storemerge, %for.body ] > %storemerge.in3 = phi i32 [ %storemerge1, > %for.body.lr.ph.peel.newph ], [ %storemerge5, %for.body ] > %idxprom = sext i32 %storemerge5 to i64 > %arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %2 = load i16, i16* %arrayidx, align 2 > %sub1 = add nsw i32 %storemerge.in3, -2 > %idxprom2 = sext i32 %sub1 to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %3 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3) > %storemerge 
= add nsw i32 %storemerge5, -1 > %cmp = icmp sgt i32 %storemerge5, 1 > br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2 > > for.end: ; preds = %for.body, %for.body.peel, > %entry > %.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ > %bit, %for.body ] > %4 = load i16, i16* %.lcssa, align 2 > tail call void @llvm.H.i16(i16 %4) > ret void > } > > ; Function Attrs: noinline nounwind uwtable > define i32 @main() local_unnamed_addr #0 { > entry: > %bits = alloca [4 x i16], align 2 > %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* > %bits, i64 0, i64 0 > call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* > %arraydecay, i32 undef) > ret i32 0 > } > > define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) { > entry.: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %1 = load i16, i16* %arrayidx1., align 2 > %2 = load i16, i16* %bit, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2) > %arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %3 = load i16, i16* %arrayidx1.1., align 2 > %arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %4 = load i16, i16* %arrayidx3.1., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4) > %arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3 > <https://maps.google.com/?q=i64+3&entry=gmail&source=g> > %5 = load i16, i16* %arrayidx1.epil., align 2 > %arrayidx3.epil. 
= getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %6 = load i16, i16* %arrayidx3.epil., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6) > ret void > } > > attributes #0 = { noinline nounwind uwtable > "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > attributes #1 = { nounwind } > > !llvm.module.flags = !{!0} > !llvm.ident = !{!1} > > !0 = !{i32 1, !"wchar_size", i32 4} > !1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git > 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) > (git at github.com:llvm-mirror/llvm.git > d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"} > !2 = distinct !{!2, !3} > !3 = !{!"llvm.loop.unroll.disable"} > > $(OPT) -S cat_state.n044.ll -mem2reg -loops -loop-simplify > -loop-rotate -lcssa -loop-unroll -unroll-threshold=100000000 -sccp > -simplifycfg -o cat_state.n045.ll > > $ cat cat_state.n045.ll > ; ModuleID = 'cat_state.n044.ll' > source_filename = "cat_state.n04_merged.scaffold" > target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: noinline nounwind uwtable > define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %cmp1 = icmp sgt i32 %n, 1 > br i1 %cmp1, label %for.body.lr.ph <http://for.body.lr.ph>, > label %for.end > > for.body.lr.ph <http://for.body.lr.ph>: ; preds = %entry > %1 = add i32 %n, -1 > %2 = add i32 %n, -2 > %xtraiter = and i32 %1, 1 > %3 = icmp ult i32 %2, 1 > br i1 
%3, label %for.cond.for.end_crit_edge.unr-lcssa, label > %for.body.lr.ph.new > > for.body.lr.ph.new: ; preds = %for.body.lr.ph > <http://for.body.lr.ph> > %unroll_iter = sub i32 %1, %xtraiter > br label %for.body > > for.body: ; preds = %for.body, %for.body.lr.ph.new > %inc3 = phi i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ] > %niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ > %niter.nsub.1, %for.body ] > %idxprom = sext i32 %inc3 to i64 > %arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %4 = load i16, i16* %arrayidx1, align 2 > %sub = add nsw i32 %inc3, -1 > %idxprom2 = sext i32 %sub to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %5 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5) > %inc = add nsw i32 %inc3, 1 > %niter.nsub = sub i32 %niter, 1 > %idxprom.1 = sext i32 %inc to i64 > %arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1 > %6 = load i16, i16* %arrayidx1.1, align 2 > %idxprom2.1 = sext i32 %inc3 to i64 > %arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.1 > %7 = load i16, i16* %arrayidx3.1, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7) > %inc.1 = add nsw i32 %inc, 1 > %niter.nsub.1 = sub i32 %niter.nsub, 1 > %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0 > br i1 %niter.ncmp.1, label %for.body, label > %for.cond.for.end_crit_edge.unr-lcssa > > for.cond.for.end_crit_edge.unr-lcssa: ; preds > %for.body, %for.body.lr.ph <http://for.body.lr.ph> > %inc3.unr = phi i32 [ 1, %for.body.lr.ph <http://for.body.lr.ph> > ], [ %inc.1, %for.body ] > %lcmp.mod = icmp ne i32 %xtraiter, 0 > br i1 %lcmp.mod, label %for.body.epil, label %for.end > > for.body.epil: ; preds > %for.cond.for.end_crit_edge.unr-lcssa > %inc3.epil = phi i32 [ %inc3.unr, > %for.cond.for.end_crit_edge.unr-lcssa ] > %idxprom.epil = sext i32 %inc3.epil to i64 > %arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.epil > %8 = load 
i16, i16* %arrayidx1.epil, align 2 > %sub.epil = add nsw i32 %inc3.epil, -1 > %idxprom2.epil = sext i32 %sub.epil to i64 > %arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.epil > %9 = load i16, i16* %arrayidx3.epil, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %8, i16 %9) > %inc.epil = add nsw i32 %inc3.epil, 1 > %cmp.epil = icmp slt i32 %inc.epil, %n > br label %for.end > > for.end: ; preds = %for.body.epil, > %for.cond.for.end_crit_edge.unr-lcssa, %entry > ret void > } > > ; Function Attrs: nounwind > declare void @llvm.H.i16(i16) #1 > > ; Function Attrs: nounwind > declare void @llvm.CNOT.i16.i16(i16, i16) #1 > > ; Function Attrs: noinline nounwind uwtable > define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 { > entry: > %storemerge1 = add nsw i32 %n, -1 > %cmp2 = icmp sgt i32 %n, 1 > br i1 %cmp2, label %for.body.peel, label %for.end > > for.body.peel: ; preds = %entry > %idxprom.peel = sext i32 %storemerge1 to i64 > %arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom.peel > %0 = load i16, i16* %arrayidx.peel, align 2 > %sub1.peel = add nsw i32 %n, -2 > %idxprom2.peel = sext i32 %sub1.peel to i64 > %arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 > %idxprom2.peel > %1 = load i16, i16* %arrayidx3.peel, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1) > %storemerge.peel = add nsw i32 %storemerge1, -1 > %cmp.peel = icmp sgt i32 %storemerge1, 1 > br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end > > for.body.lr.ph.peel.newph: ; preds > %for.body.peel > br label %for.body > > for.body: ; preds = %for.body, > %for.body.lr.ph.peel.newph > %storemerge5 = phi i32 [ %storemerge.peel, > %for.body.lr.ph.peel.newph ], [ %storemerge, %for.body ] > %storemerge.in3 = phi i32 [ %storemerge1, > %for.body.lr.ph.peel.newph ], [ %storemerge5, %for.body ] > %idxprom = sext i32 %storemerge5 to i64 > %arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom > %2 = load i16, i16* 
%arrayidx, align 2 > %sub1 = add nsw i32 %storemerge.in3, -2 > %idxprom2 = sext i32 %sub1 to i64 > %arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2 > %3 = load i16, i16* %arrayidx3, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3) > %storemerge = add nsw i32 %storemerge5, -1 > %cmp = icmp sgt i32 %storemerge5, 1 > br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2 > > for.end: ; preds = %for.body, %for.body.peel, > %entry > %.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ > %bit, %for.body ] > %4 = load i16, i16* %.lcssa, align 2 > tail call void @llvm.H.i16(i16 %4) > ret void > } > > ; Function Attrs: noinline nounwind uwtable > define i32 @main() local_unnamed_addr #0 { > entry: > %bits = alloca [4 x i16], align 2 > %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* > %bits, i64 0, i64 0 > call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* > %arraydecay, i32 undef) > ret i32 0 > } > > define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) { > entry.: > %0 = load i16, i16* %bit, align 2 > tail call void @llvm.H.i16(i16 %0) > %arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %1 = load i16, i16* %arrayidx1., align 2 > %2 = load i16, i16* %bit, align 2 > tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2) > %arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %3 = load i16, i16* %arrayidx1.1., align 2 > %arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1 > <https://maps.google.com/?q=i64+1&entry=gmail&source=g> > %4 = load i16, i16* %arrayidx3.1., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4) > %arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3 > <https://maps.google.com/?q=i64+3&entry=gmail&source=g> > %5 = load i16, i16* %arrayidx1.epil., align 2 > %arrayidx3.epil. 
= getelementptr inbounds i16, i16* %bit, i64 2 > <https://maps.google.com/?q=i64+2&entry=gmail&source=g> > %6 = load i16, i16* %arrayidx3.epil., align 2 > tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6) > ret void > } > > attributes #0 = { noinline nounwind uwtable > "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > attributes #1 = { nounwind } > > !llvm.module.flags = !{!0} > !llvm.ident = !{!1} > > !0 = !{i32 1, !"wchar_size", i32 4} > !1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git > 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) > (git at github.com:llvm-mirror/llvm.git > d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"} > !2 = distinct !{!2, !3} > !3 = !{!"llvm.loop.unroll.disable"} > > > There are still for *loops* in catN and unCatN Functions, > workaround might be using GlobalDCE PASS towards cat_state.n045.ll > to remove !Live DeadFunctions. 
> > Cat_State testcase: > https://github.com/ScaffCC/ScaffCC/blob/master/Algorithms/Cat_State/cat_state.n04.scaffold > <https://github.com/ScaffCC/ScaffCC/blob/master/Algorithms/Cat_State/cat_state.n04.scaffold> > > Scaffold builtin gates: > https://github.com/ScaffCC/scaff-clang/blob/master/include/clang/Basic/Builtins.def#L108 > <https://github.com/ScaffCC/scaff-clang/blob/master/include/clang/Basic/Builtins.def#L108> > > Ali JavadiAbhari, Shruti Patil, Daniel Kudrow, Jeff Heckey, Alexey > Lvov, Frederic Chong and Margaret Martonosi, ScaffCC: A Framework > for Compilation and Analysis of Quantum Computing Programs, ACM > International Conference on Computing Frontiers (CF 2014), > Cagliari, Italy, May 2014 > > > 在 2017年10月24日 12:52, Michael Kruse 写道: > > 2017-10-24 6:19 GMT+02:00 Leslie Zhai via llvm-dev > <llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org> > <mailto:llvm-dev at lists.llvm.org > <mailto:llvm-dev at lists.llvm.org>>>: > > attributes #0 = { noinline nounwind *optnone* uwtable > > The optnone attribute (added by clang in -O0) tells LLVM not to > apply any transformation. Avoid with -Xclang -disable-O0-optnone > > Michael > > > -- > Regards, > Leslie Zhai - https://reviews.llvm.org/p/xiangzhai/ > <https://reviews.llvm.org/p/xiangzhai/> > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org> > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev > <http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev> > >-- Regards, Leslie Zhai - https://reviews.llvm.org/p/xiangzhai/