thr3ads.net - llvm dev - [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass [Jan 2012]

If this information is useful, please help other people find it:
Share via:

Hal Finkel

2012-Jan-26 21:41 UTC

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
> arm-none-linux-gnueabi

Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
vectorization (even though I don't get vectorization when targeting
x86_64). I'll let you know what I find.

 -Hal

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory

Sebastian Pop

2012-Jan-26 21:49 UTC

head link

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

On Thu, Jan 26, 2012 at 3:41 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
>> arm-none-linux-gnueabi
>
> Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
Minor remark: please use -target instead of -ccc-host-triple that is
now deprecated.

Thanks for looking at this testcase.
Sebastian
--
Qualcomm Innovation Center, Inc is a member of Code Aurora Forum

Hal Finkel

2012-Jan-26 23:20 UTC

head link

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

On Thu, 2012-01-26 at 15:49 -0600, Sebastian Pop wrote:
> On Thu, Jan 26, 2012 at 3:41 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> > On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
> >> arm-none-linux-gnueabi
For what cpu are you compiling?

I think this may be a case where not having information on exactly what
can be vectorized on the backend may be hurting us. The LLVM output looks
okay (attached), but it may be that the post-legalization optimizations
are just not good enough to undo the damage done by an unfortunate
selection of instructions to vectorize. The options available in the
pass currently are fairly coarse, but please try setting them as
appropriate for your cpu and see if that makes a difference:

-bb-vectorize-aligned-only - Only generate aligned loads and stores
-bb-vectorize-no-casts - Don't try to vectorize casting (conversion)
operations
-bb-vectorize-no-floats - Don't try to vectorize floating-point values
-bb-vectorize-no-fma - Don't try to vectorize the fused-multiply-add
intrinsic
-bb-vectorize-no-ints - Don't try to vectorize integer values
-bb-vectorize-no-math - Don't try to vectorize floating-point math
intrinsics
-bb-vectorize-no-mem-ops - Don't try to vectorize loads and stores
-bb-vectorize-vector-bits=<uint> - The size of the native vector
registers (128 is the default)
> >
> > Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
> 
> Minor remark: please use -target instead of -ccc-host-triple that is
> now deprecated.
Correct, thanks!

 -Hal
> 
> Thanks for looking at this testcase.
> Sebastian
> --
> Qualcomm Innovation Center, Inc is a member of Code Aurora Forum
-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory
-------------- next part --------------
; ModuleID = 'test-s-20120126.c'
; Module-level header: target description, the C library struct types the
; program refers to, and the two globals used by @main.
; Little-endian ('e') layout with 32-bit pointers ('p:32:32:32').
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv4t-none-linux-gnueabi"

; Layout of the C library FILE object; only used here as the pointee type of
; @stdout and of fprintf's first parameter.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*,
i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8],
i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Two i32 fields each; @main reads fields 0 and 1 of %struct.timeval.
%struct.timeval = type { i32, i32 }
%struct.timezone = type { i32, i32 }

@stdout = external global %struct._IO_FILE*
; fprintf format string, 35 bytes including the trailing "\0A\00".
; The literal must stay on one line: a line break inside c"..." would become
; part of the string data (the byte between "time:" and "%18.9f" is a space).
@.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1

; Scalar version of the benchmark driver.
;
; 1. Allocates two 30000-byte buffers (%call = input, %call1 = output).
; 2. Fills the input buffer with the low 8 bits of the indices 0..29999.
; 3. Between two gettimeofday calls, runs 10000 outer passes; each pass walks
;    10000 groups of 3 input bytes (r0, r1, r2) and writes 3 output bytes:
;        w0 = trunc(r0*123 + r1*321 + r2*567)
;        w1 = trunc(r0*234 + r1*432 + r2*987)
;        w2 = trunc(r0*345 + r1*543 + r2*789)
; 4. Prints the elapsed time via fprintf(stdout, @.str, seconds).
; 5. Returns zext(output[12]) + zext(output[9988]).
define i32 @main() nounwind {
entry:
  %start = alloca %struct.timeval, align 4
  %end = alloca %struct.timeval, align 4
  ; %call is the input buffer, %call1 the output buffer (see for.body10).
  %call = call noalias i8* @malloc(i32 30000) nounwind
  %call1 = call noalias i8* @malloc(i32 30000) nounwind
  br label %for.body

; Initialisation loop: store trunc(i) to each of the 30000 input bytes.
for.body:                                         ; preds = %for.body, %entry
  %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ]
  %conv = trunc i32 %i.068 to i8
  %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1
  store i8 %conv, i8* %x.067, align 1, !tbaa !0
  %inc = add nsw i32 %i.068, 1
  %exitcond70 = icmp eq i32 %inc, 30000
  br i1 %exitcond70, label %for.end, label %for.body

; Take the start timestamp.
for.end:                                          ; preds = %for.body
  %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind
  br label %for.cond7.preheader

; Outer loop header: %k.066 counts passes 0..9999.
for.cond7.preheader:                              ; preds = %for.inc45, %for.end
  %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ]
  br label %for.body10

; Kernel loop: 10000 iterations per pass; the read pointer %r.063 and write
; pointer %w.065 restart at the buffer bases on every pass and each advance
; by 3 bytes per iteration.
for.body10:                                       ; preds = %for.body10, %for.cond7.preheader
  %w.065 = phi i8* [ %call1, %for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ]
  %i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ]
  %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ]
  ; Load the three input bytes r0 (%0), r1 (%1), r2 (%2).
  %incdec.ptr11 = getelementptr inbounds i8* %r.063, i32 1
  %0 = load i8* %r.063, align 1, !tbaa !0
  %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2
  %1 = load i8* %incdec.ptr11, align 1, !tbaa !0
  %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3
  %2 = load i8* %incdec.ptr12, align 1, !tbaa !0
  ; w0 = trunc(r0*123 + r1*321 + r2*567)
  %conv14 = zext i8 %0 to i32
  %mul = mul nsw i32 %conv14, 123
  %conv15 = zext i8 %1 to i32
  %mul16 = mul nsw i32 %conv15, 321
  %conv17 = zext i8 %2 to i32
  %mul18 = mul nsw i32 %conv17, 567
  %add = add i32 %mul16, %mul
  %add19 = add i32 %add, %mul18
  %conv20 = trunc i32 %add19 to i8
  %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1
  store i8 %conv20, i8* %w.065, align 1, !tbaa !0
  ; w1 = trunc(r0*234 + r1*432 + r2*987)
  %mul23 = mul nsw i32 %conv14, 234
  %mul25 = mul nsw i32 %conv15, 432
  %mul28 = mul nsw i32 %conv17, 987
  %add26 = add i32 %mul25, %mul23
  %add29 = add i32 %add26, %mul28
  %conv30 = trunc i32 %add29 to i8
  %incdec.ptr31 = getelementptr inbounds i8* %w.065, i32 2
  store i8 %conv30, i8* %incdec.ptr21, align 1, !tbaa !0
  ; w2 = trunc(r0*345 + r1*543 + r2*789)
  %mul33 = mul nsw i32 %conv14, 345
  %mul35 = mul nsw i32 %conv15, 543
  %mul38 = mul nsw i32 %conv17, 789
  %add36 = add i32 %mul35, %mul33
  %add39 = add i32 %add36, %mul38
  %conv40 = trunc i32 %add39 to i8
  %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3
  store i8 %conv40, i8* %incdec.ptr31, align 1, !tbaa !0
  %inc43 = add nsw i32 %i.164, 1
  %exitcond = icmp eq i32 %inc43, 10000
  br i1 %exitcond, label %for.inc45, label %for.body10

; Outer loop latch: next pass, or leave after 10000 passes.
for.inc45:                                        ; preds = %for.body10
  %inc46 = add nsw i32 %k.066, 1
  %exitcond69 = icmp eq i32 %inc46, 10000
  br i1 %exitcond69, label %for.end47, label %for.cond7.preheader

; Take the end timestamp, report the elapsed time and compute the result.
for.end47:                                        ; preds = %for.inc45
  %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind
  ; (end.field0 - start.field0) * 1000000, widened to i64.
  %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0
  %3 = load i32* %tv_sec, align 4, !tbaa !2
  %tv_sec49 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 0
  %4 = load i32* %tv_sec49, align 4, !tbaa !2
  %sub = sub nsw i32 %3, %4
  %mul50 = mul nsw i32 %sub, 1000000
  %conv51 = sext i32 %mul50 to i64
  ; (end.field1 - start.field1), widened to i64 and added to the above.
  %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1
  %5 = load i32* %tv_usec, align 4, !tbaa !2
  %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1
  %6 = load i32* %tv_usec53, align 4, !tbaa !2
  %sub54 = sub nsw i32 %5, %6
  %conv55 = sext i32 %sub54 to i64
  %add56 = add i64 %conv55, %conv51
  %7 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3
  ; The i64 total is converted as unsigned (uitofp) and divided by 1e6.
  %conv57 = uitofp i64 %add56 to double
  %div = fdiv double %conv57, 1.000000e+06
  %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind
  ; Return value: sum of output bytes 12 and 9988, each zero-extended.
  %arrayidx = getelementptr inbounds i8* %call1, i32 12
  %8 = load i8* %arrayidx, align 1, !tbaa !0
  %conv59 = zext i8 %8 to i32
  %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988
  %9 = load i8* %arrayidx60, align 1, !tbaa !0
  %conv61 = zext i8 %9 to i32
  %add62 = add nsw i32 %conv61, %conv59
  ret i32 %add62
}

; External C library functions called from @main.
declare noalias i8* @malloc(i32) nounwind

declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind

declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) nounwind

; TBAA metadata nodes referenced by the !tbaa attachments in @main:
;   !0 "omnipotent char"  (parent !1) - attached to the i8 loads/stores
;   !1 "Simple C/C++ TBAA"            - root node (no parent)
;   !2 "long"             (parent !0) - attached to the timeval field loads
;   !3 "any pointer"      (parent !0) - attached to the load of @stdout
!0 = metadata !{metadata !"omnipotent char", metadata !1}
!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
!2 = metadata !{metadata !"long", metadata !0}
!3 = metadata !{metadata !"any pointer", metadata !0}
-------------- next part --------------
; ModuleID = 'test-s-20120126.c'
; Module-level header: target description, the C library struct types the
; program refers to, and the two globals used by @main.
; Little-endian ('e') layout with 32-bit pointers ('p:32:32:32').
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv4t-none-linux-gnueabi"

; Layout of the C library FILE object; only used here as the pointee type of
; @stdout and of fprintf's first parameter.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*,
i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8],
i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Two i32 fields each; @main reads fields 0 and 1 of %struct.timeval.
%struct.timeval = type { i32, i32 }
%struct.timezone = type { i32, i32 }

@stdout = external global %struct._IO_FILE*
; fprintf format string, 35 bytes including the trailing "\0A\00".
; The literal must stay on one line: a line break inside c"..." would become
; part of the string data (the byte between "time:" and "%18.9f" is a space).
@.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1

; Vectorized version of the benchmark driver (kernel loop partly rewritten
; with <2 x ...> vector operations; everything else matches the scalar form).
;
; 1. Allocates two 30000-byte buffers (%call = input, %call1 = output).
; 2. Fills the input buffer with the low 8 bits of the indices 0..29999.
; 3. Between two gettimeofday calls, runs 10000 outer passes; each pass walks
;    10000 groups of 3 input bytes (r0, r1, r2) and writes 3 output bytes:
;        w0 = trunc(r0*123 + r1*321 + r2*567)      (scalar store)
;        w1 = trunc(r0*234 + r1*432 + r2*987)      (lane 0 of a <2 x i8> store)
;        w2 = trunc(r0*345 + r1*543 + r2*789)      (lane 1 of a <2 x i8> store)
; 4. Prints the elapsed time via fprintf(stdout, @.str, seconds).
; 5. Returns zext(output[12]) + zext(output[9988]).
define i32 @main() nounwind {
entry:
  %start = alloca %struct.timeval, align 4
  %end = alloca %struct.timeval, align 4
  ; %call is the input buffer, %call1 the output buffer (see for.body10).
  %call = call noalias i8* @malloc(i32 30000) nounwind
  %call1 = call noalias i8* @malloc(i32 30000) nounwind
  br label %for.body

; Initialisation loop: store trunc(i) to each of the 30000 input bytes.
for.body:                                         ; preds = %for.body, %entry
  %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ]
  %conv = trunc i32 %i.068 to i8
  %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1
  store i8 %conv, i8* %x.067, align 1, !tbaa !0
  %inc = add nsw i32 %i.068, 1
  %exitcond70 = icmp eq i32 %inc, 30000
  br i1 %exitcond70, label %for.end, label %for.body

; Take the start timestamp.
for.end:                                          ; preds = %for.body
  %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind
  br label %for.cond7.preheader

; Outer loop header: %k.066 counts passes 0..9999.
for.cond7.preheader:                              ; preds = %for.inc45, %for.end
  %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ]
  br label %for.body10

; Kernel loop: 10000 iterations per pass; the read pointer %r.063 and write
; pointer %w.065 restart at the buffer bases on every pass and each advance
; by 3 bytes per iteration.
for.body10:                                       ; preds = %for.body10, %for.cond7.preheader
  %w.065 = phi i8* [ %call1, %for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ]
  %i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ]
  %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ]
  ; Load <r0, r1> as one <2 x i8> (%1) and r2 as a scalar (%2).
  %0 = bitcast i8* %r.063 to <2 x i8>*
  %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2
  %1 = load <2 x i8>* %0, align 1, !tbaa !0
  %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3
  %2 = load i8* %incdec.ptr12, align 1, !tbaa !0
  ; %conv14 = <r0, r1> widened to i32 lanes.
  %conv14 = zext <2 x i8> %1 to <2 x i32>
  ; %mul = <r0*123, r1*321>; both lanes are extracted and summed as scalars.
  %mul = mul nsw <2 x i32> %conv14, <i32 123, i32 321>
  %mul.v.r1 = extractelement <2 x i32> %mul, i32 0
  %mul.v.r2 = extractelement <2 x i32> %mul, i32 1
  %conv17 = zext i8 %2 to i32
  %mul18 = mul nsw i32 %conv17, 567
  ; w0 = trunc(r1*321 + r0*123 + r2*567), stored as a scalar byte.
  %add = add i32 %mul.v.r2, %mul.v.r1
  %add19 = add i32 %add, %mul18
  %conv20 = trunc i32 %add19 to i8
  %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1
  store i8 %conv20, i8* %w.065, align 1, !tbaa !0
  %mul28 = mul nsw i32 %conv17, 987
  ; %mul25.v.i0 = <r1, r0> (lanes of %conv14 swapped).
  %mul25.v.i0 = shufflevector <2 x i32> %conv14, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ; %mul25 = <r1*432, r0*345>
  %mul25 = mul nsw <2 x i32> %mul25.v.i0, <i32 432, i32 345>
  ; %mul23 = <r0*234, r1*543>
  %mul23 = mul nsw <2 x i32> %conv14, <i32 234, i32 543>
  %mul38 = mul nsw i32 %conv17, 789
  ; Regroup the products per output lane:
  ;   %add26.v.i1 = <mul23[0], mul25[1]> = <r0*234, r0*345>
  ;   %add26.v.i0 = <mul25[0], mul23[1]> = <r1*432, r1*543>
  %add26.v.i1 = shufflevector <2 x i32> %mul23, <2 x i32> %mul25, <2 x i32> <i32 0, i32 3>
  %add26.v.i0 = shufflevector <2 x i32> %mul25, <2 x i32> %mul23, <2 x i32> <i32 0, i32 3>
  ; %add26 = <r1*432 + r0*234, r1*543 + r0*345>
  %add26 = add <2 x i32> %add26.v.i0, %add26.v.i1
  ; Build <r2*987, r2*789> from the two scalar products and add it in.
  %add29.v.i1.1 = insertelement <2 x i32> undef, i32 %mul28, i32 0
  %add29.v.i1.2 = insertelement <2 x i32> %add29.v.i1.1, i32 %mul38, i32 1
  %add29 = add <2 x i32> %add26, %add29.v.i1.2
  ; <w1, w2> truncated to bytes and stored together at w+1.
  %conv30 = trunc <2 x i32> %add29 to <2 x i8>
  %3 = bitcast i8* %incdec.ptr21 to <2 x i8>*
  %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3
  store <2 x i8> %conv30, <2 x i8>* %3, align 1, !tbaa !0
  %inc43 = add nsw i32 %i.164, 1
  %exitcond = icmp eq i32 %inc43, 10000
  br i1 %exitcond, label %for.inc45, label %for.body10

; Outer loop latch: next pass, or leave after 10000 passes.
for.inc45:                                        ; preds = %for.body10
  %inc46 = add nsw i32 %k.066, 1
  %exitcond69 = icmp eq i32 %inc46, 10000
  br i1 %exitcond69, label %for.end47, label %for.cond7.preheader

; Take the end timestamp, report the elapsed time and compute the result.
for.end47:                                        ; preds = %for.inc45
  %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind
  ; (end.field0 - start.field0) * 1000000, widened to i64.
  %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0
  %4 = load i32* %tv_sec, align 4, !tbaa !2
  %tv_sec49 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 0
  %5 = load i32* %tv_sec49, align 4, !tbaa !2
  %sub = sub nsw i32 %4, %5
  %mul50 = mul nsw i32 %sub, 1000000
  %conv51 = sext i32 %mul50 to i64
  ; (end.field1 - start.field1), widened to i64 and added to the above.
  %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1
  %6 = load i32* %tv_usec, align 4, !tbaa !2
  %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1
  %7 = load i32* %tv_usec53, align 4, !tbaa !2
  %sub54 = sub nsw i32 %6, %7
  %conv55 = sext i32 %sub54 to i64
  %add56 = add i64 %conv55, %conv51
  %8 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3
  ; The i64 total is converted as unsigned (uitofp) and divided by 1e6.
  %conv57 = uitofp i64 %add56 to double
  %div = fdiv double %conv57, 1.000000e+06
  %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind
  ; Return value: sum of output bytes 12 and 9988, each zero-extended.
  %arrayidx = getelementptr inbounds i8* %call1, i32 12
  %9 = load i8* %arrayidx, align 1, !tbaa !0
  %conv59 = zext i8 %9 to i32
  %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988
  %10 = load i8* %arrayidx60, align 1, !tbaa !0
  %conv61 = zext i8 %10 to i32
  %add62 = add nsw i32 %conv61, %conv59
  ret i32 %add62
}

; External C library functions called from @main.
declare noalias i8* @malloc(i32) nounwind

declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind

declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) nounwind

; TBAA metadata nodes referenced by the !tbaa attachments in @main:
;   !0 "omnipotent char"  (parent !1) - attached to the i8 and <2 x i8> loads/stores
;   !1 "Simple C/C++ TBAA"            - root node (no parent)
;   !2 "long"             (parent !0) - attached to the timeval field loads
;   !3 "any pointer"      (parent !0) - attached to the load of @stdout
!0 = metadata !{metadata !"omnipotent char", metadata !1}
!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
!2 = metadata !{metadata !"long", metadata !0}
!3 = metadata !{metadata !"any pointer", metadata !0}

Possibly Parallel Threads

Search for more reasonably related threads

llvm dev - Jan 2012 - [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

Possibly Parallel Threads