thr3ads.net - llvm dev - [LLVMdev] loop vectorizer: Unexpected extract/insertelement [Nov 2013]

If this information is useful, please help other people find it:
Share via:

Frank Winter

2013-Nov-06 16:15 UTC

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

The following IR implements the following nested loop:

for (int i = start ; i < end ; ++i )
   for (int p = 0 ; p < 4 ; ++p )
     a[i*4+p] = b[i*4+p] + c[i*4+p];




define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* 
noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
entrypoint:
   br i1 %arg2, label %L0, label %L1

L0:                                               ; preds = %entrypoint
   %0 = add nsw i64 %arg0, %arg3
   %1 = add nsw i64 %arg1, %arg3
   br label %L2

L1:                                               ; preds = %entrypoint
   br label %L2

L2:                                               ; preds = %L0, %L1
   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   br label %L5

L3:                                               ; preds = %L3, %L5
   %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ]
   %7 = mul i64 %19, 4
   %8 = add nsw i64 %7, %6
   %9 = getelementptr float* %arg5, i64 %8
   %10 = load float* %9
   %11 = getelementptr float* %arg6, i64 %8
   %12 = load float* %11
   %13 = fadd float %12, %10
   %14 = getelementptr float* %arg4, i64 %8
   store float %13, float* %14
   %15 = add nsw i64 %6, 1
   %16 = icmp sge i64 %15, 4
   br i1 %16, label %L4, label %L3

L4:                                               ; preds = %L3
   %17 = add nsw i64 %19, 1
   %18 = icmp sge i64 %17, %5
   br i1 %18, label %L6, label %L5

L5:                                               ; preds = %L4, %L2
   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
   br label %L3

L6:                                               ; preds = %L4
   ret void
}


L3 is the inner loop with constant trip count 4.

When calling the loop vectorizer,

opt -loop-vectorize -debug-only=loop-vectorize 
-vectorizer-min-trip-count 4 loop4.ll -S

LV:
Checking a loop in "main"
LV: Found a loop: L3
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: We don't need a runtime memory check.
LV: We can vectorize this loop!
LV: Found trip count: 4
LV: The Widest type: 32 bits.
LV: The Widest register is: 128 bits.
....
LV: Vector loop of width 4 costs: 7.
LV: Selecting VF = : 4.
LV: Found a vectorizable loop (4) in loop4.ll
LV: Unroll Factor is 1

we can see that the loop was vectorized. However, the code that is 
produced doesn't look too good:


define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* 
noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
entrypoint:
   br i1 %arg2, label %L0, label %L1

L0:                                               ; preds = %entrypoint
   %0 = add nsw i64 %arg0, %arg3
   %1 = add nsw i64 %arg1, %arg3
   br label %L2

L1:                                               ; preds = %entrypoint
   br label %L2

L2:                                               ; preds = %L1, %L0
   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   br label %L5

L3:                                               ; preds = %scalar.ph, %L3
   %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ]
   %7 = mul i64 %19, 4
   %8 = add nsw i64 %7, %6
   %9 = getelementptr float* %arg5, i64 %8
   %10 = load float* %9
   %11 = getelementptr float* %arg6, i64 %8
   %12 = load float* %11
   %13 = fadd float %12, %10
   %14 = getelementptr float* %arg4, i64 %8
   store float %13, float* %14
   %15 = add nsw i64 %6, 1
   %16 = icmp sge i64 %15, 4
   br i1 %16, label %L4, label %L3, !llvm.loop !0

L4:                                               ; preds = 
%middle.block, %L3
   %17 = add nsw i64 %19, 1
   %18 = icmp sge i64 %17, %5
   br i1 %18, label %L6, label %L5

L5:                                               ; preds = %L4, %L2
   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
   br i1 false, label %middle.block, label %vector.ph

vector.ph:                                        ; preds = %L5
   %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0
   %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, 
<4 x i64> undef, <4 x i32> zeroinitializer
   br label %vector.body

vector.body:                                      ; preds = 
%vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32
0
   %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert,
<4
x i64> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2,
i64 3>
   %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64
4>
   %21 = add nsw <4 x i64> %20, %induction
   %22 = extractelement <4 x i64> %21, i32 0
   %23 = getelementptr float* %arg5, i64 %22
   %24 = insertelement <4 x float*> undef, float* %23, i32 0
   %25 = extractelement <4 x i64> %21, i32 1
   %26 = getelementptr float* %arg5, i64 %25
   %27 = insertelement <4 x float*> %24, float* %26, i32 1
   %28 = extractelement <4 x i64> %21, i32 2
   %29 = getelementptr float* %arg5, i64 %28
   %30 = insertelement <4 x float*> %27, float* %29, i32 2
   %31 = extractelement <4 x i64> %21, i32 3
   %32 = getelementptr float* %arg5, i64 %31
   %33 = insertelement <4 x float*> %30, float* %32, i32 3
   %34 = extractelement <4 x i64> %21, i32 0
   %35 = getelementptr float* %arg5, i64 %34
   %36 = getelementptr float* %35, i32 0
   %37 = bitcast float* %36 to <4 x float>*
   %wide.load = load <4 x float>* %37
   %38 = extractelement <4 x i64> %21, i32 0
   %39 = getelementptr float* %arg6, i64 %38
   %40 = insertelement <4 x float*> undef, float* %39, i32 0
   %41 = extractelement <4 x i64> %21, i32 1
   %42 = getelementptr float* %arg6, i64 %41
   %43 = insertelement <4 x float*> %40, float* %42, i32 1
   %44 = extractelement <4 x i64> %21, i32 2
   %45 = getelementptr float* %arg6, i64 %44
   %46 = insertelement <4 x float*> %43, float* %45, i32 2
   %47 = extractelement <4 x i64> %21, i32 3
   %48 = getelementptr float* %arg6, i64 %47
   %49 = insertelement <4 x float*> %46, float* %48, i32 3
   %50 = extractelement <4 x i64> %21, i32 0
   %51 = getelementptr float* %arg6, i64 %50
   %52 = getelementptr float* %51, i32 0
   %53 = bitcast float* %52 to <4 x float>*
   %wide.load3 = load <4 x float>* %53
   %54 = fadd <4 x float> %wide.load3, %wide.load
   %55 = extractelement <4 x i64> %21, i32 0
   %56 = getelementptr float* %arg4, i64 %55
   %57 = insertelement <4 x float*> undef, float* %56, i32 0
   %58 = extractelement <4 x i64> %21, i32 1
   %59 = getelementptr float* %arg4, i64 %58
   %60 = insertelement <4 x float*> %57, float* %59, i32 1
   %61 = extractelement <4 x i64> %21, i32 2
   %62 = getelementptr float* %arg4, i64 %61
   %63 = insertelement <4 x float*> %60, float* %62, i32 2
   %64 = extractelement <4 x i64> %21, i32 3
   %65 = getelementptr float* %arg4, i64 %64
   %66 = insertelement <4 x float*> %63, float* %65, i32 3
   %67 = extractelement <4 x i64> %21, i32 0
   %68 = getelementptr float* %arg4, i64 %67
   %69 = getelementptr float* %68, i32 0
   %70 = bitcast float* %69 to <4 x float>*
   store <4 x float> %54, <4 x float>* %70
   %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1>
   %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4>
   %index.next = add i64 %index, 4
   %73 = icmp eq i64 %index.next, 4
   br i1 %73, label %middle.block, label %vector.body

middle.block:                                     ; preds = 
%vector.body, %L5
   %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   %cmp.n = icmp eq i64 4, %resume.val
   br i1 %cmp.n, label %L4, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block
   br label %L3

L6:                                               ; preds = %L4
   ret void
}



Why did the loop vectorizer produced so many insertelement and 
extractelement instructions?

I don't remember that those instructions entered when vectorizing other 
loops. Is this harmless? Which pass can clean up these instructions?

Frank

Arnold Schwaighofer

2013-Nov-06 16:30 UTC

head link

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

The loop vectorizer relies on cleanup passes to be run after it:

from Transforms/IPO/PassManagerBuilder.cpp:

    // Add the various vectorization passes and relevant cleanup passes for
    // them since we are no longer in the middle of the main scalar pipeline.
    MPM.add(createLoopVectorizePass(DisableUnrollLoops));
    MPM.add(createInstructionCombiningPass());
    MPM.add(createCFGSimplificationPass());


On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote:
> The following IR implements the following nested loop:
> 
> for (int i = start ; i < end ; ++i )
>  for (int p = 0 ; p < 4 ; ++p )
>    a[i*4+p] = b[i*4+p] + c[i*4+p];
> 
> 
> 
> 
> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias
%arg4, float* noalias %arg5, float* noalias %arg6) {
> entrypoint:
>  br i1 %arg2, label %L0, label %L1
> 
> L0:                                               ; preds = %entrypoint
>  %0 = add nsw i64 %arg0, %arg3
>  %1 = add nsw i64 %arg1, %arg3
>  br label %L2
> 
> L1:                                               ; preds = %entrypoint
>  br label %L2
> 
> L2:                                               ; preds = %L0, %L1
>  %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
>  %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
>  %4 = sdiv i64 %2, 4
>  %5 = sdiv i64 %3, 4
>  br label %L5
> 
> L3:                                               ; preds = %L3, %L5
>  %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ]
>  %7 = mul i64 %19, 4
>  %8 = add nsw i64 %7, %6
>  %9 = getelementptr float* %arg5, i64 %8
>  %10 = load float* %9
>  %11 = getelementptr float* %arg6, i64 %8
>  %12 = load float* %11
>  %13 = fadd float %12, %10
>  %14 = getelementptr float* %arg4, i64 %8
>  store float %13, float* %14
>  %15 = add nsw i64 %6, 1
>  %16 = icmp sge i64 %15, 4
>  br i1 %16, label %L4, label %L3
> 
> L4:                                               ; preds = %L3
>  %17 = add nsw i64 %19, 1
>  %18 = icmp sge i64 %17, %5
>  br i1 %18, label %L6, label %L5
> 
> L5:                                               ; preds = %L4, %L2
>  %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
>  br label %L3
> 
> L6:                                               ; preds = %L4
>  ret void
> }
> 
> 
> L3 is the inner loop with constant trip count 4.
> 
> When calling the loop vectorizer,
> 
> opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4
loop4.ll -S
> 
> LV:
> Checking a loop in "main"
> LV: Found a loop: L3
> LV: Found an induction variable.
> LV: We need to do 0 pointer comparisons.
> LV: We don't need a runtime memory check.
> LV: We can vectorize this loop!
> LV: Found trip count: 4
> LV: The Widest type: 32 bits.
> LV: The Widest register is: 128 bits.
> ....
> LV: Vector loop of width 4 costs: 7.
> LV: Selecting VF = : 4.
> LV: Found a vectorizable loop (4) in loop4.ll
> LV: Unroll Factor is 1
> 
> we can see that the loop was vectorized. However, the code that is produced
doesn't look too good:
> 
> 
> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias
%arg4, float* noalias %arg5, float* noalias %arg6) {
> entrypoint:
>  br i1 %arg2, label %L0, label %L1
> 
> L0:                                               ; preds = %entrypoint
>  %0 = add nsw i64 %arg0, %arg3
>  %1 = add nsw i64 %arg1, %arg3
>  br label %L2
> 
> L1:                                               ; preds = %entrypoint
>  br label %L2
> 
> L2:                                               ; preds = %L1, %L0
>  %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
>  %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
>  %4 = sdiv i64 %2, 4
>  %5 = sdiv i64 %3, 4
>  br label %L5
> 
> L3:                                               ; preds = %scalar.ph, %L3
>  %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ]
>  %7 = mul i64 %19, 4
>  %8 = add nsw i64 %7, %6
>  %9 = getelementptr float* %arg5, i64 %8
>  %10 = load float* %9
>  %11 = getelementptr float* %arg6, i64 %8
>  %12 = load float* %11
>  %13 = fadd float %12, %10
>  %14 = getelementptr float* %arg4, i64 %8
>  store float %13, float* %14
>  %15 = add nsw i64 %6, 1
>  %16 = icmp sge i64 %15, 4
>  br i1 %16, label %L4, label %L3, !llvm.loop !0
> 
> L4:                                               ; preds = %middle.block,
%L3
>  %17 = add nsw i64 %19, 1
>  %18 = icmp sge i64 %17, %5
>  br i1 %18, label %L6, label %L5
> 
> L5:                                               ; preds = %L4, %L2
>  %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
>  br i1 false, label %middle.block, label %vector.ph
> 
> vector.ph:                                        ; preds = %L5
>  %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19,
i32 0
>  %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1,
<4 x i64> undef, <4 x i32> zeroinitializer
>  br label %vector.body
> 
> vector.body:                                      ; preds = %vector.body,
%vector.ph
>  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
>  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index,
i32 0
>  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert,
<4 x i64> undef, <4 x i32> zeroinitializer
>  %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64
2, i64 3>
>  %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64
4>
>  %21 = add nsw <4 x i64> %20, %induction
>  %22 = extractelement <4 x i64> %21, i32 0
>  %23 = getelementptr float* %arg5, i64 %22
>  %24 = insertelement <4 x float*> undef, float* %23, i32 0
>  %25 = extractelement <4 x i64> %21, i32 1
>  %26 = getelementptr float* %arg5, i64 %25
>  %27 = insertelement <4 x float*> %24, float* %26, i32 1
>  %28 = extractelement <4 x i64> %21, i32 2
>  %29 = getelementptr float* %arg5, i64 %28
>  %30 = insertelement <4 x float*> %27, float* %29, i32 2
>  %31 = extractelement <4 x i64> %21, i32 3
>  %32 = getelementptr float* %arg5, i64 %31
>  %33 = insertelement <4 x float*> %30, float* %32, i32 3
>  %34 = extractelement <4 x i64> %21, i32 0
>  %35 = getelementptr float* %arg5, i64 %34
>  %36 = getelementptr float* %35, i32 0
>  %37 = bitcast float* %36 to <4 x float>*
>  %wide.load = load <4 x float>* %37
>  %38 = extractelement <4 x i64> %21, i32 0
>  %39 = getelementptr float* %arg6, i64 %38
>  %40 = insertelement <4 x float*> undef, float* %39, i32 0
>  %41 = extractelement <4 x i64> %21, i32 1
>  %42 = getelementptr float* %arg6, i64 %41
>  %43 = insertelement <4 x float*> %40, float* %42, i32 1
>  %44 = extractelement <4 x i64> %21, i32 2
>  %45 = getelementptr float* %arg6, i64 %44
>  %46 = insertelement <4 x float*> %43, float* %45, i32 2
>  %47 = extractelement <4 x i64> %21, i32 3
>  %48 = getelementptr float* %arg6, i64 %47
>  %49 = insertelement <4 x float*> %46, float* %48, i32 3
>  %50 = extractelement <4 x i64> %21, i32 0
>  %51 = getelementptr float* %arg6, i64 %50
>  %52 = getelementptr float* %51, i32 0
>  %53 = bitcast float* %52 to <4 x float>*
>  %wide.load3 = load <4 x float>* %53
>  %54 = fadd <4 x float> %wide.load3, %wide.load
>  %55 = extractelement <4 x i64> %21, i32 0
>  %56 = getelementptr float* %arg4, i64 %55
>  %57 = insertelement <4 x float*> undef, float* %56, i32 0
>  %58 = extractelement <4 x i64> %21, i32 1
>  %59 = getelementptr float* %arg4, i64 %58
>  %60 = insertelement <4 x float*> %57, float* %59, i32 1
>  %61 = extractelement <4 x i64> %21, i32 2
>  %62 = getelementptr float* %arg4, i64 %61
>  %63 = insertelement <4 x float*> %60, float* %62, i32 2
>  %64 = extractelement <4 x i64> %21, i32 3
>  %65 = getelementptr float* %arg4, i64 %64
>  %66 = insertelement <4 x float*> %63, float* %65, i32 3
>  %67 = extractelement <4 x i64> %21, i32 0
>  %68 = getelementptr float* %arg4, i64 %67
>  %69 = getelementptr float* %68, i32 0
>  %70 = bitcast float* %69 to <4 x float>*
>  store <4 x float> %54, <4 x float>* %70
>  %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64
1>
>  %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4>
>  %index.next = add i64 %index, 4
>  %73 = icmp eq i64 %index.next, 4
>  br i1 %73, label %middle.block, label %vector.body
> 
> middle.block:                                     ; preds = %vector.body,
%L5
>  %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
>  %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
>  %cmp.n = icmp eq i64 4, %resume.val
>  br i1 %cmp.n, label %L4, label %scalar.ph
> 
> scalar.ph:                                        ; preds = %middle.block
>  br label %L3
> 
> L6:                                               ; preds = %L4
>  ret void
> }
> 
> 
> 
> Why did the loop vectorizer produced so many insertelement and
extractelement instructions?
> 
> I don't remember that those instructions entered when vectorizing other
loops. Is this harmless? Which pass can clean up these instructions?
> 
> Frank
> 
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

Frank Winter

2013-Nov-06 16:39 UTC

head link

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

The instcombine pass cleans up a lot.

Any idea why there are still shufflevector, insertelement, *and* bitcast 
(!!) etc. instructions left? The original loop is so clean, a textbook 
example I'd say. There is no need to shuffle anything.At least I don't 
see it.

Frank


vector.ph:                                        ; preds = %L5
   %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0
   %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, 
<4 x i64> undef, <4 x i32> zeroinitializer
   br label %vector.body

vector.body:                                      ; preds = 
%vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32
0
   %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert,
<4
x i64> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2,
i64 3>
   %20 = shl <4 x i64> %broadcast.splat2, <i64 2, i64 2, i64 2, i64
2>
   %21 = add nsw <4 x i64> %20, %induction
   %22 = extractelement <4 x i64> %21, i32 0
   %23 = getelementptr float* %arg5, i64 %22
   %24 = bitcast float* %23 to <4 x float>*
   %wide.load = load <4 x float>* %24, align 16
   %25 = extractelement <4 x i64> %21, i32 0
   %26 = getelementptr float* %arg6, i64 %25
   %27 = bitcast float* %26 to <4 x float>*
   %wide.load3 = load <4 x float>* %27, align 16
   %28 = fadd <4 x float> %wide.load3, %wide.load
   %29 = extractelement <4 x i64> %21, i32 0
   %30 = getelementptr float* %arg4, i64 %29
   %31 = bitcast float* %30 to <4 x float>*
   store <4 x float> %28, <4 x float>* %31, align 16
   %index.next = add i64 %index, 4
   %32 = icmp eq i64 %index, 0
   br i1 %32, label %middle.block, label %vector.body

middle.block:                                     ; preds = 
%vector.body, %L5
   %resume.val = phi i1 [ false, %L5 ], [ true, %vector.body ]
   %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   br i1 %resume.val, label %L4, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block
   br label %L3






On 06/11/13 11:30, Arnold Schwaighofer wrote:> The loop vectorizer relies on cleanup passes to be run after it:
>
> from Transforms/IPO/PassManagerBuilder.cpp:
>
>      // Add the various vectorization passes and relevant cleanup passes
for
>      // them since we are no longer in the middle of the main scalar
pipeline.
>      MPM.add(createLoopVectorizePass(DisableUnrollLoops));
>      MPM.add(createInstructionCombiningPass());
>      MPM.add(createCFGSimplificationPass());
>
>
> On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote:
>
>> The following IR implements the following nested loop:
>>
>> for (int i = start ; i < end ; ++i )
>>   for (int p = 0 ; p < 4 ; ++p )
>>     a[i*4+p] = b[i*4+p] + c[i*4+p];
>>
>>
>>
>>
>> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float*
noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
>> entrypoint:
>>   br i1 %arg2, label %L0, label %L1
>>
>> L0:                                               ; preds = %entrypoint
>>   %0 = add nsw i64 %arg0, %arg3
>>   %1 = add nsw i64 %arg1, %arg3
>>   br label %L2
>>
>> L1:                                               ; preds = %entrypoint
>>   br label %L2
>>
>> L2:                                               ; preds = %L0, %L1
>>   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
>>   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
>>   %4 = sdiv i64 %2, 4
>>   %5 = sdiv i64 %3, 4
>>   br label %L5
>>
>> L3:                                               ; preds = %L3, %L5
>>   %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ]
>>   %7 = mul i64 %19, 4
>>   %8 = add nsw i64 %7, %6
>>   %9 = getelementptr float* %arg5, i64 %8
>>   %10 = load float* %9
>>   %11 = getelementptr float* %arg6, i64 %8
>>   %12 = load float* %11
>>   %13 = fadd float %12, %10
>>   %14 = getelementptr float* %arg4, i64 %8
>>   store float %13, float* %14
>>   %15 = add nsw i64 %6, 1
>>   %16 = icmp sge i64 %15, 4
>>   br i1 %16, label %L4, label %L3
>>
>> L4:                                               ; preds = %L3
>>   %17 = add nsw i64 %19, 1
>>   %18 = icmp sge i64 %17, %5
>>   br i1 %18, label %L6, label %L5
>>
>> L5:                                               ; preds = %L4, %L2
>>   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
>>   br label %L3
>>
>> L6:                                               ; preds = %L4
>>   ret void
>> }
>>
>>
>> L3 is the inner loop with constant trip count 4.
>>
>> When calling the loop vectorizer,
>>
>> opt -loop-vectorize -debug-only=loop-vectorize
-vectorizer-min-trip-count 4 loop4.ll -S
>>
>> LV:
>> Checking a loop in "main"
>> LV: Found a loop: L3
>> LV: Found an induction variable.
>> LV: We need to do 0 pointer comparisons.
>> LV: We don't need a runtime memory check.
>> LV: We can vectorize this loop!
>> LV: Found trip count: 4
>> LV: The Widest type: 32 bits.
>> LV: The Widest register is: 128 bits.
>> ....
>> LV: Vector loop of width 4 costs: 7.
>> LV: Selecting VF = : 4.
>> LV: Found a vectorizable loop (4) in loop4.ll
>> LV: Unroll Factor is 1
>>
>> we can see that the loop was vectorized. However, the code that is
produced doesn't look too good:
>>
>>
>> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float*
noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
>> entrypoint:
>>   br i1 %arg2, label %L0, label %L1
>>
>> L0:                                               ; preds = %entrypoint
>>   %0 = add nsw i64 %arg0, %arg3
>>   %1 = add nsw i64 %arg1, %arg3
>>   br label %L2
>>
>> L1:                                               ; preds = %entrypoint
>>   br label %L2
>>
>> L2:                                               ; preds = %L1, %L0
>>   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
>>   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
>>   %4 = sdiv i64 %2, 4
>>   %5 = sdiv i64 %3, 4
>>   br label %L5
>>
>> L3:                                               ; preds = %scalar.ph,
%L3
>>   %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ]
>>   %7 = mul i64 %19, 4
>>   %8 = add nsw i64 %7, %6
>>   %9 = getelementptr float* %arg5, i64 %8
>>   %10 = load float* %9
>>   %11 = getelementptr float* %arg6, i64 %8
>>   %12 = load float* %11
>>   %13 = fadd float %12, %10
>>   %14 = getelementptr float* %arg4, i64 %8
>>   store float %13, float* %14
>>   %15 = add nsw i64 %6, 1
>>   %16 = icmp sge i64 %15, 4
>>   br i1 %16, label %L4, label %L3, !llvm.loop !0
>>
>> L4:                                               ; preds =
%middle.block, %L3
>>   %17 = add nsw i64 %19, 1
>>   %18 = icmp sge i64 %17, %5
>>   br i1 %18, label %L6, label %L5
>>
>> L5:                                               ; preds = %L4, %L2
>>   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
>>   br i1 false, label %middle.block, label %vector.ph
>>
>> vector.ph:                                        ; preds = %L5
>>   %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64
%19, i32 0
>>   %broadcast.splat2 = shufflevector <4 x i64>
%broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer
>>   br label %vector.body
>>
>> vector.body:                                      ; preds =
%vector.body, %vector.ph
>>   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
>>   %broadcast.splatinsert = insertelement <4 x i64> undef, i64
%index, i32 0
>>   %broadcast.splat = shufflevector <4 x i64>
%broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
>>   %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1,
i64 2, i64 3>
>>   %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4,
i64 4>
>>   %21 = add nsw <4 x i64> %20, %induction
>>   %22 = extractelement <4 x i64> %21, i32 0
>>   %23 = getelementptr float* %arg5, i64 %22
>>   %24 = insertelement <4 x float*> undef, float* %23, i32 0
>>   %25 = extractelement <4 x i64> %21, i32 1
>>   %26 = getelementptr float* %arg5, i64 %25
>>   %27 = insertelement <4 x float*> %24, float* %26, i32 1
>>   %28 = extractelement <4 x i64> %21, i32 2
>>   %29 = getelementptr float* %arg5, i64 %28
>>   %30 = insertelement <4 x float*> %27, float* %29, i32 2
>>   %31 = extractelement <4 x i64> %21, i32 3
>>   %32 = getelementptr float* %arg5, i64 %31
>>   %33 = insertelement <4 x float*> %30, float* %32, i32 3
>>   %34 = extractelement <4 x i64> %21, i32 0
>>   %35 = getelementptr float* %arg5, i64 %34
>>   %36 = getelementptr float* %35, i32 0
>>   %37 = bitcast float* %36 to <4 x float>*
>>   %wide.load = load <4 x float>* %37
>>   %38 = extractelement <4 x i64> %21, i32 0
>>   %39 = getelementptr float* %arg6, i64 %38
>>   %40 = insertelement <4 x float*> undef, float* %39, i32 0
>>   %41 = extractelement <4 x i64> %21, i32 1
>>   %42 = getelementptr float* %arg6, i64 %41
>>   %43 = insertelement <4 x float*> %40, float* %42, i32 1
>>   %44 = extractelement <4 x i64> %21, i32 2
>>   %45 = getelementptr float* %arg6, i64 %44
>>   %46 = insertelement <4 x float*> %43, float* %45, i32 2
>>   %47 = extractelement <4 x i64> %21, i32 3
>>   %48 = getelementptr float* %arg6, i64 %47
>>   %49 = insertelement <4 x float*> %46, float* %48, i32 3
>>   %50 = extractelement <4 x i64> %21, i32 0
>>   %51 = getelementptr float* %arg6, i64 %50
>>   %52 = getelementptr float* %51, i32 0
>>   %53 = bitcast float* %52 to <4 x float>*
>>   %wide.load3 = load <4 x float>* %53
>>   %54 = fadd <4 x float> %wide.load3, %wide.load
>>   %55 = extractelement <4 x i64> %21, i32 0
>>   %56 = getelementptr float* %arg4, i64 %55
>>   %57 = insertelement <4 x float*> undef, float* %56, i32 0
>>   %58 = extractelement <4 x i64> %21, i32 1
>>   %59 = getelementptr float* %arg4, i64 %58
>>   %60 = insertelement <4 x float*> %57, float* %59, i32 1
>>   %61 = extractelement <4 x i64> %21, i32 2
>>   %62 = getelementptr float* %arg4, i64 %61
>>   %63 = insertelement <4 x float*> %60, float* %62, i32 2
>>   %64 = extractelement <4 x i64> %21, i32 3
>>   %65 = getelementptr float* %arg4, i64 %64
>>   %66 = insertelement <4 x float*> %63, float* %65, i32 3
>>   %67 = extractelement <4 x i64> %21, i32 0
>>   %68 = getelementptr float* %arg4, i64 %67
>>   %69 = getelementptr float* %68, i32 0
>>   %70 = bitcast float* %69 to <4 x float>*
>>   store <4 x float> %54, <4 x float>* %70
>>   %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1,
i64 1>
>>   %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64
4>
>>   %index.next = add i64 %index, 4
>>   %73 = icmp eq i64 %index.next, 4
>>   br i1 %73, label %middle.block, label %vector.body
>>
>> middle.block:                                     ; preds =
%vector.body, %L5
>>   %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
>>   %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
>>   %cmp.n = icmp eq i64 4, %resume.val
>>   br i1 %cmp.n, label %L4, label %scalar.ph
>>
>> scalar.ph:                                        ; preds =
%middle.block
>>   br label %L3
>>
>> L6:                                               ; preds = %L4
>>   ret void
>> }
>>
>>
>>
>> Why did the loop vectorizer produced so many insertelement and
extractelement instructions?
>>
>> I don't remember that those instructions entered when vectorizing
other loops. Is this harmless? Which pass can clean up these instructions?
>>
>> Frank
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

Possibly Parallel Threads

Search for more seemingly similar threads

llvm dev - Nov 2013 - [LLVMdev] loop vectorizer: Unexpected extract/insertelement

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

[LLVMdev] loop vectorizer: Unexpected extract/insertelement

Possibly Parallel Threads