Hal Finkel
2012-Jan-26 21:41 UTC
[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
> arm-none-linux-gnueabi
Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get vectorization (even though I don't get vectorization when targeting x86_64). I'll let you know what I find. -Hal -- Hal Finkel Postdoctoral Appointee Leadership Computing Facility Argonne National Laboratory
Sebastian Pop
2012-Jan-26 21:49 UTC
[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
On Thu, Jan 26, 2012 at 3:41 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
>> arm-none-linux-gnueabi
>
> Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
Minor remark: please use -target instead of -ccc-host-triple that is now deprecated. Thanks for looking at this testcase. Sebastian -- Qualcomm Innovation Center, Inc is a member of Code Aurora Forum
Hal Finkel
2012-Jan-26 23:20 UTC
[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
On Thu, 2012-01-26 at 15:49 -0600, Sebastian Pop wrote:
> On Thu, Jan 26, 2012 at 3:41 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> > On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
> >> arm-none-linux-gnueabi
For what cpu are you compiling? I think this may be a case where not having information on exactly what can be vectorized on the backend may be hurting us. The LLVM output looks okay (attached), but it may be that the post-legalization optimizations are just not good enough to undo the damage done by an unfortunate selection of instructions to vectorize. The options available in the pass currently are fairly coarse, but please try setting them as appropriate for your cpu and see if that makes a difference:
-bb-vectorize-aligned-only - Only generate aligned loads and stores
-bb-vectorize-no-casts - Don't try to vectorize casting (conversion) operations
-bb-vectorize-no-floats - Don't try to vectorize floating-point values
-bb-vectorize-no-fma - Don't try to vectorize the fused-multiply-add intrinsic
-bb-vectorize-no-ints - Don't try to vectorize integer values
-bb-vectorize-no-math - Don't try to vectorize floating-point math intrinsics
-bb-vectorize-no-mem-ops - Don't try to vectorize loads and stores
-bb-vectorize-vector-bits=<uint> - The size of the native vector registers (128 is the default)
> >
> > Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
>
> Minor remark: please use -target instead of -ccc-host-triple that is
> now deprecated.
Correct, thanks! -Hal
>
> Thanks for looking at this testcase.
> Sebastian > -- > Qualcomm Innovation Center, Inc is a member of Code Aurora Forum-- Hal Finkel Postdoctoral Appointee Leadership Computing Facility Argonne National Laboratory -------------- next part -------------- ; ModuleID = 'test-s-20120126.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" target triple = "armv4t-none-linux-gnueabi" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } %struct.timeval = type { i32, i32 } %struct.timezone = type { i32, i32 } @stdout = external global %struct._IO_FILE* @.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1 define i32 @main() nounwind { entry: %start = alloca %struct.timeval, align 4 %end = alloca %struct.timeval, align 4 %call = call noalias i8* @malloc(i32 30000) nounwind %call1 = call noalias i8* @malloc(i32 30000) nounwind br label %for.body for.body: ; preds = %for.body, %entry %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ] %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ] %conv = trunc i32 %i.068 to i8 %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1 store i8 %conv, i8* %x.067, align 1, !tbaa !0 %inc = add nsw i32 %i.068, 1 %exitcond70 = icmp eq i32 %inc, 30000 br i1 %exitcond70, label %for.end, label %for.body for.end: ; preds = %for.body %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind br label %for.cond7.preheader for.cond7.preheader: ; preds = %for.inc45, %for.end %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ] br label %for.body10 for.body10: ; preds = %for.body10, %for.cond7.preheader %w.065 = phi i8* [ %call1, %for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ] 
%i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ] %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ] %incdec.ptr11 = getelementptr inbounds i8* %r.063, i32 1 %0 = load i8* %r.063, align 1, !tbaa !0 %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2 %1 = load i8* %incdec.ptr11, align 1, !tbaa !0 %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3 %2 = load i8* %incdec.ptr12, align 1, !tbaa !0 %conv14 = zext i8 %0 to i32 %mul = mul nsw i32 %conv14, 123 %conv15 = zext i8 %1 to i32 %mul16 = mul nsw i32 %conv15, 321 %conv17 = zext i8 %2 to i32 %mul18 = mul nsw i32 %conv17, 567 %add = add i32 %mul16, %mul %add19 = add i32 %add, %mul18 %conv20 = trunc i32 %add19 to i8 %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1 store i8 %conv20, i8* %w.065, align 1, !tbaa !0 %mul23 = mul nsw i32 %conv14, 234 %mul25 = mul nsw i32 %conv15, 432 %mul28 = mul nsw i32 %conv17, 987 %add26 = add i32 %mul25, %mul23 %add29 = add i32 %add26, %mul28 %conv30 = trunc i32 %add29 to i8 %incdec.ptr31 = getelementptr inbounds i8* %w.065, i32 2 store i8 %conv30, i8* %incdec.ptr21, align 1, !tbaa !0 %mul33 = mul nsw i32 %conv14, 345 %mul35 = mul nsw i32 %conv15, 543 %mul38 = mul nsw i32 %conv17, 789 %add36 = add i32 %mul35, %mul33 %add39 = add i32 %add36, %mul38 %conv40 = trunc i32 %add39 to i8 %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3 store i8 %conv40, i8* %incdec.ptr31, align 1, !tbaa !0 %inc43 = add nsw i32 %i.164, 1 %exitcond = icmp eq i32 %inc43, 10000 br i1 %exitcond, label %for.inc45, label %for.body10 for.inc45: ; preds = %for.body10 %inc46 = add nsw i32 %k.066, 1 %exitcond69 = icmp eq i32 %inc46, 10000 br i1 %exitcond69, label %for.end47, label %for.cond7.preheader for.end47: ; preds = %for.inc45 %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0 %3 = load i32* %tv_sec, align 4, !tbaa !2 %tv_sec49 = 
getelementptr inbounds %struct.timeval* %start, i32 0, i32 0 %4 = load i32* %tv_sec49, align 4, !tbaa !2 %sub = sub nsw i32 %3, %4 %mul50 = mul nsw i32 %sub, 1000000 %conv51 = sext i32 %mul50 to i64 %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1 %5 = load i32* %tv_usec, align 4, !tbaa !2 %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1 %6 = load i32* %tv_usec53, align 4, !tbaa !2 %sub54 = sub nsw i32 %5, %6 %conv55 = sext i32 %sub54 to i64 %add56 = add i64 %conv55, %conv51 %7 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3 %conv57 = uitofp i64 %add56 to double %div = fdiv double %conv57, 1.000000e+06 %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind %arrayidx = getelementptr inbounds i8* %call1, i32 12 %8 = load i8* %arrayidx, align 1, !tbaa !0 %conv59 = zext i8 %8 to i32 %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988 %9 = load i8* %arrayidx60, align 1, !tbaa !0 %conv61 = zext i8 %9 to i32 %add62 = add nsw i32 %conv61, %conv59 ret i32 %add62 } declare noalias i8* @malloc(i32) nounwind declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) 
nounwind !0 = metadata !{metadata !"omnipotent char", metadata !1} !1 = metadata !{metadata !"Simple C/C++ TBAA", null} !2 = metadata !{metadata !"long", metadata !0} !3 = metadata !{metadata !"any pointer", metadata !0} -------------- next part -------------- ; ModuleID = 'test-s-20120126.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" target triple = "armv4t-none-linux-gnueabi" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } %struct.timeval = type { i32, i32 } %struct.timezone = type { i32, i32 } @stdout = external global %struct._IO_FILE* @.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1 define i32 @main() nounwind { entry: %start = alloca %struct.timeval, align 4 %end = alloca %struct.timeval, align 4 %call = call noalias i8* @malloc(i32 30000) nounwind %call1 = call noalias i8* @malloc(i32 30000) nounwind br label %for.body for.body: ; preds = %for.body, %entry %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ] %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ] %conv = trunc i32 %i.068 to i8 %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1 store i8 %conv, i8* %x.067, align 1, !tbaa !0 %inc = add nsw i32 %i.068, 1 %exitcond70 = icmp eq i32 %inc, 30000 br i1 %exitcond70, label %for.end, label %for.body for.end: ; preds = %for.body %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind br label %for.cond7.preheader for.cond7.preheader: ; preds = %for.inc45, %for.end %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ] br label %for.body10 for.body10: ; preds = %for.body10, %for.cond7.preheader %w.065 = phi i8* [ %call1, 
%for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ] %i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ] %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ] %0 = bitcast i8* %r.063 to <2 x i8>* %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2 %1 = load <2 x i8>* %0, align 1, !tbaa !0 %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3 %2 = load i8* %incdec.ptr12, align 1, !tbaa !0 %conv14 = zext <2 x i8> %1 to <2 x i32> %mul = mul nsw <2 x i32> %conv14, <i32 123, i32 321> %mul.v.r1 = extractelement <2 x i32> %mul, i32 0 %mul.v.r2 = extractelement <2 x i32> %mul, i32 1 %conv17 = zext i8 %2 to i32 %mul18 = mul nsw i32 %conv17, 567 %add = add i32 %mul.v.r2, %mul.v.r1 %add19 = add i32 %add, %mul18 %conv20 = trunc i32 %add19 to i8 %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1 store i8 %conv20, i8* %w.065, align 1, !tbaa !0 %mul28 = mul nsw i32 %conv17, 987 %mul25.v.i0 = shufflevector <2 x i32> %conv14, <2 x i32> undef, <2 x i32> <i32 1, i32 0> %mul25 = mul nsw <2 x i32> %mul25.v.i0, <i32 432, i32 345> %mul23 = mul nsw <2 x i32> %conv14, <i32 234, i32 543> %mul38 = mul nsw i32 %conv17, 789 %add26.v.i1 = shufflevector <2 x i32> %mul23, <2 x i32> %mul25, <2 x i32> <i32 0, i32 3> %add26.v.i0 = shufflevector <2 x i32> %mul25, <2 x i32> %mul23, <2 x i32> <i32 0, i32 3> %add26 = add <2 x i32> %add26.v.i0, %add26.v.i1 %add29.v.i1.1 = insertelement <2 x i32> undef, i32 %mul28, i32 0 %add29.v.i1.2 = insertelement <2 x i32> %add29.v.i1.1, i32 %mul38, i32 1 %add29 = add <2 x i32> %add26, %add29.v.i1.2 %conv30 = trunc <2 x i32> %add29 to <2 x i8> %3 = bitcast i8* %incdec.ptr21 to <2 x i8>* %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3 store <2 x i8> %conv30, <2 x i8>* %3, align 1, !tbaa !0 %inc43 = add nsw i32 %i.164, 1 %exitcond = icmp eq i32 %inc43, 10000 br i1 %exitcond, label %for.inc45, label %for.body10 for.inc45: ; preds = %for.body10 %inc46 = add nsw i32 %k.066, 1 %exitcond69 = icmp eq i32 
%inc46, 10000 br i1 %exitcond69, label %for.end47, label %for.cond7.preheader for.end47: ; preds = %for.inc45 %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0 %4 = load i32* %tv_sec, align 4, !tbaa !2 %tv_sec49 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 0 %5 = load i32* %tv_sec49, align 4, !tbaa !2 %sub = sub nsw i32 %4, %5 %mul50 = mul nsw i32 %sub, 1000000 %conv51 = sext i32 %mul50 to i64 %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1 %6 = load i32* %tv_usec, align 4, !tbaa !2 %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1 %7 = load i32* %tv_usec53, align 4, !tbaa !2 %sub54 = sub nsw i32 %6, %7 %conv55 = sext i32 %sub54 to i64 %add56 = add i64 %conv55, %conv51 %8 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3 %conv57 = uitofp i64 %add56 to double %div = fdiv double %conv57, 1.000000e+06 %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind %arrayidx = getelementptr inbounds i8* %call1, i32 12 %9 = load i8* %arrayidx, align 1, !tbaa !0 %conv59 = zext i8 %9 to i32 %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988 %10 = load i8* %arrayidx60, align 1, !tbaa !0 %conv61 = zext i8 %10 to i32 %add62 = add nsw i32 %conv61, %conv59 ret i32 %add62 } declare noalias i8* @malloc(i32) nounwind declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) nounwind !0 = metadata !{metadata !"omnipotent char", metadata !1} !1 = metadata !{metadata !"Simple C/C++ TBAA", null} !2 = metadata !{metadata !"long", metadata !0} !3 = metadata !{metadata !"any pointer", metadata !0}
Reasonably Related Threads
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass