vivek pandya via llvm-dev
2019-Sep-14 16:15 UTC
[llvm-dev] Understanding Loop Vectorized IR
Hello, For the C code given below: #include<stdio.h> int a=0; int d() { int e = 2; for (a = 0; a <= 8; a++) ; return e; } void main() { int f = 0; d(); printf("%d\n",a); } $clang -O3 -c -emit-llvm -mllvm -disable-llvm-optzns small.c $opt -gvn -licm -loop-rotate -loop-vectorize small.bc -o small-opt.bc I see vectorized IR as follows: ; Function Attrs: nounwind uwtable define dso_local i32 @d() #0 { entry: %e = alloca i32, align 4 %0 = bitcast i32* %e to i8* call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3 store i32 2, i32* %e, align 4, !tbaa !2 store i32 0, i32* @a, align 4, !tbaa !2 %a.promoted = load i32, i32* @a, align 4, !tbaa !2 br i1 false, label %scalar.ph, label %vector.ph vector.ph: ; preds = %entry %vector.recur.init = insertelement <16 x i32> undef, i32 %a.promoted, i32 15 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vector.recur = phi <16 x i32> [ %vector.recur.init, %vector.ph ], [ %17, %vector.body ] %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ] %1 = add i32 %index, 0 %2 = add i32 %index, 1 %3 = add i32 %index, 2 %4 = add i32 %index, 3 %5 = add i32 %index, 4 %6 = add i32 %index, 5 %7 = add i32 %index, 6 %8 = add i32 %index, 7 %9 = add i32 %index, 8 %10 = add i32 %index, 9 %11 = add i32 %index, 10 %12 = add i32 %index, 11 %13 = add i32 %index, 12 %14 = add i32 %index, 13 %15 = add i32 %index, 14 %16 = add i32 %index, 15 %17 = add nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %18 = shufflevector <16 x i32> %vector.recur, <16 x i32> %17, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> %19 = icmp ule <16 x i32> %vec.ind, <i32 9, i32 9, 
i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> %index.next = add i32 %index, 16 %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> %20 = icmp eq i32 %index.next, 16 br i1 %20, label %middle.block, label %vector.body, !llvm.loop !6 middle.block: ; preds = %vector.body %vector.recur.extract = extractelement <16 x i32> %17, i32 15 %vector.recur.extract.for.phi = extractelement <16 x i32> %17, i32 14 br i1 true, label %for.end, label %scalar.ph scalar.ph: ; preds = %middle.block, %entry %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %a.promoted, %entry ] %bc.resume.val = phi i32 [ 16, %middle.block ], [ 0, %entry ] br label %for.cond for.cond: ; preds = %for.cond, %scalar.ph %scalar.recur = phi i32 [ %inc, %for.cond ], [ %scalar.recur.init, %scalar.ph ] %21 = phi i32 [ %inc, %for.cond ], [ %bc.resume.val, %scalar.ph ] %cmp = icmp sle i32 %21, 8 %inc = add nsw i32 %21, 1 br i1 %cmp, label %for.cond, label %for.end, !llvm.loop !8 for.end: ; preds = %middle.block, %for.cond %inc1.lcssa = phi i32 [ %scalar.recur, %for.cond ], [ %vector.recur.extract.for.phi, %middle.block ] store i32 %inc1.lcssa, i32* @a, align 4, !tbaa !2 call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3 ret i32 2 } As highlighted above, the results of a few instructions are never used; can someone explain why? Also, this gives the output (variable a) 15, which is incorrect, as the output should be 9. However, I don't see any problem with the vectorized code, and hence a = 15 is not surprising. So is the solution to this problem that the loop should never have been vectorized? -Vivek -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190914/ec4a2dac/attachment-0001.html>
Alexey Zhikhartsev via llvm-dev
2019-Sep-16 13:46 UTC
[llvm-dev] Understanding Loop Vectorized IR
I'm no loop vectorizer expert but it seems that it relies on instcombine to remove redundant instructions. Adding -instcombine at the end of the opt pipeline gets rid of the instructions. Alexey On Sat., Sep. 14, 2019, 12:15 p.m. vivek pandya via llvm-dev, < llvm-dev at lists.llvm.org> wrote:> Hello, > For the C code given below: > > #include<stdio.h> > int a=0; > int d() { > int e = 2; > for (a = 0; a <= 8; a++) > ; > return e; > } > void main() { > int f = 0; > d(); > printf("%d\n",a); > } > > $clang -O3 -c -emit-llvm -mllvm -disable-llvm-optzns small.c > > $opt -gvn -licm -loop-rotate -loop-vectorize small.bc -o small-opt.bc > > I see vectorized IR as follow: > ; Function Attrs: nounwind uwtable > define dso_local i32 @d() #0 { > entry: > %e = alloca i32, align 4 > %0 = bitcast i32* %e to i8* > call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3 > store i32 2, i32* %e, align 4, !tbaa !2 > store i32 0, i32* @a, align 4, !tbaa !2 > %a.promoted = load i32, i32* @a, align 4, !tbaa !2 > br i1 false, label %scalar.ph, label %vector.ph > > vector.ph: ; preds = %entry > %vector.recur.init = insertelement <16 x i32> undef, i32 %a.promoted, i32 15 > br label %vector.body > > vector.body: ; preds = %vector.body, %vector.ph > %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] > %vector.recur = phi <16 x i32> [ %vector.recur.init, %vector.ph ], [ %17, %vector.body ] > %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ] > %1 = add i32 %index, 0 > %2 = add i32 %index, 1 > %3 = add i32 %index, 2 > %4 = add i32 %index, 3 > %5 = add i32 %index, 4 > %6 = add i32 %index, 5 > %7 = add i32 %index, 6 > %8 = add i32 %index, 7 > %9 = add i32 %index, 8 > %10 = add i32 %index, 9 > %11 = add i32 %index, 10 > %12 = add i32 %index, 11 > %13 = add i32 %index, 12 > %14 = add i32 %index, 13 > %15 = add i32 %index, 14 > %16 = add i32 
%index, 15 > %17 = add nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> > %18 = shufflevector <16 x i32> %vector.recur, <16 x i32> %17, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> > %19 = icmp ule <16 x i32> %vec.ind, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> > %index.next = add i32 %index, 16 > %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> > %20 = icmp eq i32 %index.next, 16 > br i1 %20, label %middle.block, label %vector.body, !llvm.loop !6 > > middle.block: ; preds = %vector.body > %vector.recur.extract = extractelement <16 x i32> %17, i32 15 > %vector.recur.extract.for.phi = extractelement <16 x i32> %17, i32 14 > br i1 true, label %for.end, label %scalar.ph > > scalar.ph: ; preds = %middle.block, %entry > %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %a.promoted, %entry ] > %bc.resume.val = phi i32 [ 16, %middle.block ], [ 0, %entry ] > br label %for.cond > > for.cond: ; preds = %for.cond, %scalar.ph > %scalar.recur = phi i32 [ %inc, %for.cond ], [ %scalar.recur.init, %scalar.ph ] > %21 = phi i32 [ %inc, %for.cond ], [ %bc.resume.val, %scalar.ph ] > %cmp = icmp sle i32 %21, 8 > %inc = add nsw i32 %21, 1 > br i1 %cmp, label %for.cond, label %for.end, !llvm.loop !8 > > for.end: ; preds = %middle.block, %for.cond > %inc1.lcssa = phi i32 [ %scalar.recur, %for.cond ], [ %vector.recur.extract.for.phi, %middle.block ] > store i32 %inc1.lcssa, i32* @a, align 4, !tbaa !2 > call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3 > ret i32 2 > } > > As highlighted above few instructions result are never used can someone explain why? 
> > Also This gives output (variable a) 15 which is incorrect as output should 9. However I don't see any problem with vectorized code and hence a = 15 is not surprising. > > So solution to this problem is that loop should have never vectorized? > > -Vivek > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190916/154253a9/attachment.html>