Frank Winter
2013-Nov-06 16:15 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
The following IR implements the following nested loop: for (int i = start ; i < end ; ++i ) for (int p = 0 ; p < 4 ; ++p ) a[i*4+p] = b[i*4+p] + c[i*4+p]; define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { entrypoint: br i1 %arg2, label %L0, label %L1 L0: ; preds = %entrypoint %0 = add nsw i64 %arg0, %arg3 %1 = add nsw i64 %arg1, %arg3 br label %L2 L1: ; preds = %entrypoint br label %L2 L2: ; preds = %L0, %L1 %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] %4 = sdiv i64 %2, 4 %5 = sdiv i64 %3, 4 br label %L5 L3: ; preds = %L3, %L5 %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ] %7 = mul i64 %19, 4 %8 = add nsw i64 %7, %6 %9 = getelementptr float* %arg5, i64 %8 %10 = load float* %9 %11 = getelementptr float* %arg6, i64 %8 %12 = load float* %11 %13 = fadd float %12, %10 %14 = getelementptr float* %arg4, i64 %8 store float %13, float* %14 %15 = add nsw i64 %6, 1 %16 = icmp sge i64 %15, 4 br i1 %16, label %L4, label %L3 L4: ; preds = %L3 %17 = add nsw i64 %19, 1 %18 = icmp sge i64 %17, %5 br i1 %18, label %L6, label %L5 L5: ; preds = %L4, %L2 %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] br label %L3 L6: ; preds = %L4 ret void } L3 is the inner loop with constant trip count 4. When calling the loop vectorizer, opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S LV: Checking a loop in "main" LV: Found a loop: L3 LV: Found an induction variable. LV: We need to do 0 pointer comparisons. LV: We don't need a runtime memory check. LV: We can vectorize this loop! LV: Found trip count: 4 LV: The Widest type: 32 bits. LV: The Widest register is: 128 bits. .... LV: Vector loop of width 4 costs: 7. LV: Selecting VF = : 4. LV: Found a vectorizable loop (4) in loop4.ll LV: Unroll Factor is 1 we can see that the loop was vectorized. However, the code that is produced doesn't look too good: define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { entrypoint: br i1 %arg2, label %L0, label %L1 L0: ; preds = %entrypoint %0 = add nsw i64 %arg0, %arg3 %1 = add nsw i64 %arg1, %arg3 br label %L2 L1: ; preds = %entrypoint br label %L2 L2: ; preds = %L1, %L0 %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] %4 = sdiv i64 %2, 4 %5 = sdiv i64 %3, 4 br label %L5 L3: ; preds = %scalar.ph, %L3 %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ] %7 = mul i64 %19, 4 %8 = add nsw i64 %7, %6 %9 = getelementptr float* %arg5, i64 %8 %10 = load float* %9 %11 = getelementptr float* %arg6, i64 %8 %12 = load float* %11 %13 = fadd float %12, %10 %14 = getelementptr float* %arg4, i64 %8 store float %13, float* %14 %15 = add nsw i64 %6, 1 %16 = icmp sge i64 %15, 4 br i1 %16, label %L4, label %L3, !llvm.loop !0 L4: ; preds = %middle.block, %L3 %17 = add nsw i64 %19, 1 %18 = icmp sge i64 %17, %5 br i1 %18, label %L6, label %L5 L5: ; preds = %L4, %L2 %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] br i1 false, label %middle.block, label %vector.ph vector.ph: ; preds = %L5 %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4> %21 = add nsw <4 x i64> %20, %induction %22 = extractelement <4 x i64> %21, i32 0 %23 = getelementptr float* %arg5, i64 %22 %24 = insertelement <4 x float*> undef, float* %23, i32 0 %25 = extractelement <4 x i64> %21, i32 1 %26 = getelementptr float* %arg5, i64 %25 %27 = insertelement <4 x float*> %24, float* %26, i32 1 %28 = extractelement <4 x i64> %21, i32 2 %29 = getelementptr float* %arg5, i64 %28 %30 = insertelement <4 x float*> %27, float* %29, i32 2 %31 = extractelement <4 x i64> %21, i32 3 %32 = getelementptr float* %arg5, i64 %31 %33 = insertelement <4 x float*> %30, float* %32, i32 3 %34 = extractelement <4 x i64> %21, i32 0 %35 = getelementptr float* %arg5, i64 %34 %36 = getelementptr float* %35, i32 0 %37 = bitcast float* %36 to <4 x float>* %wide.load = load <4 x float>* %37 %38 = extractelement <4 x i64> %21, i32 0 %39 = getelementptr float* %arg6, i64 %38 %40 = insertelement <4 x float*> undef, float* %39, i32 0 %41 = extractelement <4 x i64> %21, i32 1 %42 = getelementptr float* %arg6, i64 %41 %43 = insertelement <4 x float*> %40, float* %42, i32 1 %44 = extractelement <4 x i64> %21, i32 2 %45 = getelementptr float* %arg6, i64 %44 %46 = insertelement <4 x float*> %43, float* %45, i32 2 %47 = extractelement <4 x i64> %21, i32 3 %48 = getelementptr float* %arg6, i64 %47 %49 = insertelement <4 x float*> %46, float* %48, i32 3 %50 = extractelement <4 x i64> %21, i32 0 %51 = getelementptr float* %arg6, i64 %50 %52 = getelementptr float* %51, i32 0 %53 = bitcast float* %52 to <4 x float>* %wide.load3 = load <4 x float>* %53 %54 = fadd <4 x float> %wide.load3, %wide.load %55 = extractelement <4 x i64> %21, i32 0 %56 = getelementptr float* %arg4, i64 %55 %57 = insertelement <4 x float*> undef, float* %56, i32 0 %58 = extractelement <4 x i64> %21, i32 1 %59 = getelementptr float* %arg4, i64 %58 %60 = insertelement <4 x float*> %57, float* %59, i32 1 %61 = extractelement <4 x i64> %21, i32 2 %62 = getelementptr float* %arg4, i64 %61 %63 = insertelement <4 x float*> %60, float* %62, i32 2 %64 = extractelement <4 x i64> %21, i32 3 %65 = getelementptr float* %arg4, i64 %64 %66 = insertelement <4 x float*> %63, float* %65, i32 3 %67 = extractelement <4 x i64> %21, i32 0 %68 = getelementptr float* %arg4, i64 %67 %69 = getelementptr float* %68, i32 0 %70 = bitcast float* %69 to <4 x float>* store <4 x float> %54, <4 x float>* %70 %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1> %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4> %index.next = add i64 %index, 4 %73 = icmp eq i64 %index.next, 4 br i1 %73, label %middle.block, label %vector.body middle.block: ; preds = %vector.body, %L5 %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] %cmp.n = icmp eq i64 4, %resume.val br i1 %cmp.n, label %L4, label %scalar.ph scalar.ph: ; preds = %middle.block br label %L3 L6: ; preds = %L4 ret void } Why did the loop vectorizer produced so many insertelement and extractelement instructions? I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions? Frank
Arnold Schwaighofer
2013-Nov-06 16:30 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
The loop vectorizer relies on cleanup passes to be run after it: from Transforms/IPO/PassManagerBuilder.cpp: // Add the various vectorization passes and relevant cleanup passes for // them since we are no longer in the middle of the main scalar pipeline. MPM.add(createLoopVectorizePass(DisableUnrollLoops)); MPM.add(createInstructionCombiningPass()); MPM.add(createCFGSimplificationPass()); On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote:> The following IR implements the following nested loop: > > for (int i = start ; i < end ; ++i ) > for (int p = 0 ; p < 4 ; ++p ) > a[i*4+p] = b[i*4+p] + c[i*4+p]; > > > > > define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { > entrypoint: > br i1 %arg2, label %L0, label %L1 > > L0: ; preds = %entrypoint > %0 = add nsw i64 %arg0, %arg3 > %1 = add nsw i64 %arg1, %arg3 > br label %L2 > > L1: ; preds = %entrypoint > br label %L2 > > L2: ; preds = %L0, %L1 > %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] > %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] > %4 = sdiv i64 %2, 4 > %5 = sdiv i64 %3, 4 > br label %L5 > > L3: ; preds = %L3, %L5 > %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ] > %7 = mul i64 %19, 4 > %8 = add nsw i64 %7, %6 > %9 = getelementptr float* %arg5, i64 %8 > %10 = load float* %9 > %11 = getelementptr float* %arg6, i64 %8 > %12 = load float* %11 > %13 = fadd float %12, %10 > %14 = getelementptr float* %arg4, i64 %8 > store float %13, float* %14 > %15 = add nsw i64 %6, 1 > %16 = icmp sge i64 %15, 4 > br i1 %16, label %L4, label %L3 > > L4: ; preds = %L3 > %17 = add nsw i64 %19, 1 > %18 = icmp sge i64 %17, %5 > br i1 %18, label %L6, label %L5 > > L5: ; preds = %L4, %L2 > %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] > br label %L3 > > L6: ; preds = %L4 > ret void > } > > > L3 is the inner loop with constant trip count 4. > > When calling the loop vectorizer, > > opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S > > LV: > Checking a loop in "main" > LV: Found a loop: L3 > LV: Found an induction variable. > LV: We need to do 0 pointer comparisons. > LV: We don't need a runtime memory check. > LV: We can vectorize this loop! > LV: Found trip count: 4 > LV: The Widest type: 32 bits. > LV: The Widest register is: 128 bits. > .... > LV: Vector loop of width 4 costs: 7. > LV: Selecting VF = : 4. > LV: Found a vectorizable loop (4) in loop4.ll > LV: Unroll Factor is 1 > > we can see that the loop was vectorized. However, the code that is produced doesn't look too good: > > > define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { > entrypoint: > br i1 %arg2, label %L0, label %L1 > > L0: ; preds = %entrypoint > %0 = add nsw i64 %arg0, %arg3 > %1 = add nsw i64 %arg1, %arg3 > br label %L2 > > L1: ; preds = %entrypoint > br label %L2 > > L2: ; preds = %L1, %L0 > %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] > %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] > %4 = sdiv i64 %2, 4 > %5 = sdiv i64 %3, 4 > br label %L5 > > L3: ; preds = %scalar.ph, %L3 > %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ] > %7 = mul i64 %19, 4 > %8 = add nsw i64 %7, %6 > %9 = getelementptr float* %arg5, i64 %8 > %10 = load float* %9 > %11 = getelementptr float* %arg6, i64 %8 > %12 = load float* %11 > %13 = fadd float %12, %10 > %14 = getelementptr float* %arg4, i64 %8 > store float %13, float* %14 > %15 = add nsw i64 %6, 1 > %16 = icmp sge i64 %15, 4 > br i1 %16, label %L4, label %L3, !llvm.loop !0 > > L4: ; preds = %middle.block, %L3 > %17 = add nsw i64 %19, 1 > %18 = icmp sge i64 %17, %5 > br i1 %18, label %L6, label %L5 > > L5: ; preds = %L4, %L2 > %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] > br i1 false, label %middle.block, label %vector.ph > > vector.ph: ; preds = %L5 > %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 > %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer > br label %vector.body > > vector.body: ; preds = %vector.body, %vector.ph > %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] > %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 > %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer > %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> > %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4> > %21 = add nsw <4 x i64> %20, %induction > %22 = extractelement <4 x i64> %21, i32 0 > %23 = getelementptr float* %arg5, i64 %22 > %24 = insertelement <4 x float*> undef, float* %23, i32 0 > %25 = extractelement <4 x i64> %21, i32 1 > %26 = getelementptr float* %arg5, i64 %25 > %27 = insertelement <4 x float*> %24, float* %26, i32 1 > %28 = extractelement <4 x i64> %21, i32 2 > %29 = getelementptr float* %arg5, i64 %28 > %30 = insertelement <4 x float*> %27, float* %29, i32 2 > %31 = extractelement <4 x i64> %21, i32 3 > %32 = getelementptr float* %arg5, i64 %31 > %33 = insertelement <4 x float*> %30, float* %32, i32 3 > %34 = extractelement <4 x i64> %21, i32 0 > %35 = getelementptr float* %arg5, i64 %34 > %36 = getelementptr float* %35, i32 0 > %37 = bitcast float* %36 to <4 x float>* > %wide.load = load <4 x float>* %37 > %38 = extractelement <4 x i64> %21, i32 0 > %39 = getelementptr float* %arg6, i64 %38 > %40 = insertelement <4 x float*> undef, float* %39, i32 0 > %41 = extractelement <4 x i64> %21, i32 1 > %42 = getelementptr float* %arg6, i64 %41 > %43 = insertelement <4 x float*> %40, float* %42, i32 1 > %44 = extractelement <4 x i64> %21, i32 2 > %45 = getelementptr float* %arg6, i64 %44 > %46 = insertelement <4 x float*> %43, float* %45, i32 2 > %47 = extractelement <4 x i64> %21, i32 3 > %48 = getelementptr float* %arg6, i64 %47 > %49 = insertelement <4 x float*> %46, float* %48, i32 3 > %50 = extractelement <4 x i64> %21, i32 0 > %51 = getelementptr float* %arg6, i64 %50 > %52 = getelementptr float* %51, i32 0 > %53 = bitcast float* %52 to <4 x float>* > %wide.load3 = load <4 x float>* %53 > %54 = fadd <4 x float> %wide.load3, %wide.load > %55 = extractelement <4 x i64> %21, i32 0 > %56 = getelementptr float* %arg4, i64 %55 > %57 = insertelement <4 x float*> undef, float* %56, i32 0 > %58 = extractelement <4 x i64> %21, i32 1 > %59 = getelementptr float* %arg4, i64 %58 > %60 = insertelement <4 x float*> %57, float* %59, i32 1 > %61 = extractelement <4 x i64> %21, i32 2 > %62 = getelementptr float* %arg4, i64 %61 > %63 = insertelement <4 x float*> %60, float* %62, i32 2 > %64 = extractelement <4 x i64> %21, i32 3 > %65 = getelementptr float* %arg4, i64 %64 > %66 = insertelement <4 x float*> %63, float* %65, i32 3 > %67 = extractelement <4 x i64> %21, i32 0 > %68 = getelementptr float* %arg4, i64 %67 > %69 = getelementptr float* %68, i32 0 > %70 = bitcast float* %69 to <4 x float>* > store <4 x float> %54, <4 x float>* %70 > %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1> > %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4> > %index.next = add i64 %index, 4 > %73 = icmp eq i64 %index.next, 4 > br i1 %73, label %middle.block, label %vector.body > > middle.block: ; preds = %vector.body, %L5 > %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] > %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] > %cmp.n = icmp eq i64 4, %resume.val > br i1 %cmp.n, label %L4, label %scalar.ph > > scalar.ph: ; preds = %middle.block > br label %L3 > > L6: ; preds = %L4 > ret void > } > > > > Why did the loop vectorizer produced so many insertelement and extractelement instructions? > > I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions? > > Frank > > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Frank Winter
2013-Nov-06 16:39 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
The instcombine pass cleans up a lot. Any idea why there are still shufflevector, insertelement, *and* bitcast (!!) etc. instructions left? The original loop is so clean, a textbook example I'd say. There is no need to shuffle anything.At least I don't see it. Frank vector.ph: ; preds = %L5 %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> %20 = shl <4 x i64> %broadcast.splat2, <i64 2, i64 2, i64 2, i64 2> %21 = add nsw <4 x i64> %20, %induction %22 = extractelement <4 x i64> %21, i32 0 %23 = getelementptr float* %arg5, i64 %22 %24 = bitcast float* %23 to <4 x float>* %wide.load = load <4 x float>* %24, align 16 %25 = extractelement <4 x i64> %21, i32 0 %26 = getelementptr float* %arg6, i64 %25 %27 = bitcast float* %26 to <4 x float>* %wide.load3 = load <4 x float>* %27, align 16 %28 = fadd <4 x float> %wide.load3, %wide.load %29 = extractelement <4 x i64> %21, i32 0 %30 = getelementptr float* %arg4, i64 %29 %31 = bitcast float* %30 to <4 x float>* store <4 x float> %28, <4 x float>* %31, align 16 %index.next = add i64 %index, 4 %32 = icmp eq i64 %index, 0 br i1 %32, label %middle.block, label %vector.body middle.block: ; preds = %vector.body, %L5 %resume.val = phi i1 [ false, %L5 ], [ true, %vector.body ] %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] br i1 %resume.val, label %L4, label %scalar.ph scalar.ph: ; preds = %middle.block br label %L3 On 06/11/13 11:30, Arnold Schwaighofer wrote:> The loop vectorizer relies on cleanup passes to be run after it: > > from Transforms/IPO/PassManagerBuilder.cpp: > > // Add the various vectorization passes and relevant cleanup passes for > // them since we are no longer in the middle of the main scalar pipeline. > MPM.add(createLoopVectorizePass(DisableUnrollLoops)); > MPM.add(createInstructionCombiningPass()); > MPM.add(createCFGSimplificationPass()); > > > On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote: > >> The following IR implements the following nested loop: >> >> for (int i = start ; i < end ; ++i ) >> for (int p = 0 ; p < 4 ; ++p ) >> a[i*4+p] = b[i*4+p] + c[i*4+p]; >> >> >> >> >> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >> entrypoint: >> br i1 %arg2, label %L0, label %L1 >> >> L0: ; preds = %entrypoint >> %0 = add nsw i64 %arg0, %arg3 >> %1 = add nsw i64 %arg1, %arg3 >> br label %L2 >> >> L1: ; preds = %entrypoint >> br label %L2 >> >> L2: ; preds = %L0, %L1 >> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >> %4 = sdiv i64 %2, 4 >> %5 = sdiv i64 %3, 4 >> br label %L5 >> >> L3: ; preds = %L3, %L5 >> %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ] >> %7 = mul i64 %19, 4 >> %8 = add nsw i64 %7, %6 >> %9 = getelementptr float* %arg5, i64 %8 >> %10 = load float* %9 >> %11 = getelementptr float* %arg6, i64 %8 >> %12 = load float* %11 >> %13 = fadd float %12, %10 >> %14 = getelementptr float* %arg4, i64 %8 >> store float %13, float* %14 >> %15 = add nsw i64 %6, 1 >> %16 = icmp sge i64 %15, 4 >> br i1 %16, label %L4, label %L3 >> >> L4: ; preds = %L3 >> %17 = add nsw i64 %19, 1 >> %18 = icmp sge i64 %17, %5 >> br i1 %18, label %L6, label %L5 >> >> L5: ; preds = %L4, %L2 >> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >> br label %L3 >> >> L6: ; preds = %L4 >> ret void >> } >> >> >> L3 is the inner loop with constant trip count 4. >> >> When calling the loop vectorizer, >> >> opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S >> >> LV: >> Checking a loop in "main" >> LV: Found a loop: L3 >> LV: Found an induction variable. >> LV: We need to do 0 pointer comparisons. >> LV: We don't need a runtime memory check. >> LV: We can vectorize this loop! >> LV: Found trip count: 4 >> LV: The Widest type: 32 bits. >> LV: The Widest register is: 128 bits. >> .... >> LV: Vector loop of width 4 costs: 7. >> LV: Selecting VF = : 4. >> LV: Found a vectorizable loop (4) in loop4.ll >> LV: Unroll Factor is 1 >> >> we can see that the loop was vectorized. However, the code that is produced doesn't look too good: >> >> >> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >> entrypoint: >> br i1 %arg2, label %L0, label %L1 >> >> L0: ; preds = %entrypoint >> %0 = add nsw i64 %arg0, %arg3 >> %1 = add nsw i64 %arg1, %arg3 >> br label %L2 >> >> L1: ; preds = %entrypoint >> br label %L2 >> >> L2: ; preds = %L1, %L0 >> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >> %4 = sdiv i64 %2, 4 >> %5 = sdiv i64 %3, 4 >> br label %L5 >> >> L3: ; preds = %scalar.ph, %L3 >> %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ] >> %7 = mul i64 %19, 4 >> %8 = add nsw i64 %7, %6 >> %9 = getelementptr float* %arg5, i64 %8 >> %10 = load float* %9 >> %11 = getelementptr float* %arg6, i64 %8 >> %12 = load float* %11 >> %13 = fadd float %12, %10 >> %14 = getelementptr float* %arg4, i64 %8 >> store float %13, float* %14 >> %15 = add nsw i64 %6, 1 >> %16 = icmp sge i64 %15, 4 >> br i1 %16, label %L4, label %L3, !llvm.loop !0 >> >> L4: ; preds = %middle.block, %L3 >> %17 = add nsw i64 %19, 1 >> %18 = icmp sge i64 %17, %5 >> br i1 %18, label %L6, label %L5 >> >> L5: ; preds = %L4, %L2 >> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >> br i1 false, label %middle.block, label %vector.ph >> >> vector.ph: ; preds = %L5 >> %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 >> %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer >> br label %vector.body >> >> vector.body: ; preds = %vector.body, %vector.ph >> %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] >> %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 >> %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer >> %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> >> %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4> >> %21 = add nsw <4 x i64> %20, %induction >> %22 = extractelement <4 x i64> %21, i32 0 >> %23 = getelementptr float* %arg5, i64 %22 >> %24 = insertelement <4 x float*> undef, float* %23, i32 0 >> %25 = extractelement <4 x i64> %21, i32 1 >> %26 = getelementptr float* %arg5, i64 %25 >> %27 = insertelement <4 x float*> %24, float* %26, i32 1 >> %28 = extractelement <4 x i64> %21, i32 2 >> %29 = getelementptr float* %arg5, i64 %28 >> %30 = insertelement <4 x float*> %27, float* %29, i32 2 >> %31 = extractelement <4 x i64> %21, i32 3 >> %32 = getelementptr float* %arg5, i64 %31 >> %33 = insertelement <4 x float*> %30, float* %32, i32 3 >> %34 = extractelement <4 x i64> %21, i32 0 >> %35 = getelementptr float* %arg5, i64 %34 >> %36 = getelementptr float* %35, i32 0 >> %37 = bitcast float* %36 to <4 x float>* >> %wide.load = load <4 x float>* %37 >> %38 = extractelement <4 x i64> %21, i32 0 >> %39 = getelementptr float* %arg6, i64 %38 >> %40 = insertelement <4 x float*> undef, float* %39, i32 0 >> %41 = extractelement <4 x i64> %21, i32 1 >> %42 = getelementptr float* %arg6, i64 %41 >> %43 = insertelement <4 x float*> %40, float* %42, i32 1 >> %44 = extractelement <4 x i64> %21, i32 2 >> %45 = getelementptr float* %arg6, i64 %44 >> %46 = insertelement <4 x float*> %43, float* %45, i32 2 >> %47 = extractelement <4 x i64> %21, i32 3 >> %48 = getelementptr float* %arg6, i64 %47 >> %49 = insertelement <4 x float*> %46, float* %48, i32 3 >> %50 = extractelement <4 x i64> %21, i32 0 >> %51 = getelementptr float* %arg6, i64 %50 >> %52 = getelementptr float* %51, i32 0 >> %53 = bitcast float* %52 to <4 x float>* >> %wide.load3 = load <4 x float>* %53 >> %54 = fadd <4 x float> %wide.load3, %wide.load >> %55 = extractelement <4 x i64> %21, i32 0 >> %56 = getelementptr float* %arg4, i64 %55 >> %57 = insertelement <4 x float*> undef, float* %56, i32 0 >> %58 = extractelement <4 x i64> %21, i32 1 >> %59 = getelementptr float* %arg4, i64 %58 >> %60 = insertelement <4 x float*> %57, float* %59, i32 1 >> %61 = extractelement <4 x i64> %21, i32 2 >> %62 = getelementptr float* %arg4, i64 %61 >> %63 = insertelement <4 x float*> %60, float* %62, i32 2 >> %64 = extractelement <4 x i64> %21, i32 3 >> %65 = getelementptr float* %arg4, i64 %64 >> %66 = insertelement <4 x float*> %63, float* %65, i32 3 >> %67 = extractelement <4 x i64> %21, i32 0 >> %68 = getelementptr float* %arg4, i64 %67 >> %69 = getelementptr float* %68, i32 0 >> %70 = bitcast float* %69 to <4 x float>* >> store <4 x float> %54, <4 x float>* %70 >> %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1> >> %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4> >> %index.next = add i64 %index, 4 >> %73 = icmp eq i64 %index.next, 4 >> br i1 %73, label %middle.block, label %vector.body >> >> middle.block: ; preds = %vector.body, %L5 >> %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >> %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >> %cmp.n = icmp eq i64 4, %resume.val >> br i1 %cmp.n, label %L4, label %scalar.ph >> >> scalar.ph: ; preds = %middle.block >> br label %L3 >> >> L6: ; preds = %L4 >> ret void >> } >> >> >> >> Why did the loop vectorizer produced so many insertelement and extractelement instructions? >> >> I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions? >> >> Frank >> >> _______________________________________________ >> LLVM Developers mailing list >> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Possibly Parallel Threads
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: this loop is not worth vectorizing
- [LLVMdev] loop vectorizer: this loop is not worth vectorizing