Frank Winter
2013-Nov-06 16:39 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
The instcombine pass cleans up a lot. Any idea why there are still shufflevector, insertelement, *and* bitcast (!!) etc. instructions left? The original loop is so clean, a textbook example I'd say. There is no need to shuffle anything.At least I don't see it. Frank vector.ph: ; preds = %L5 %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> %20 = shl <4 x i64> %broadcast.splat2, <i64 2, i64 2, i64 2, i64 2> %21 = add nsw <4 x i64> %20, %induction %22 = extractelement <4 x i64> %21, i32 0 %23 = getelementptr float* %arg5, i64 %22 %24 = bitcast float* %23 to <4 x float>* %wide.load = load <4 x float>* %24, align 16 %25 = extractelement <4 x i64> %21, i32 0 %26 = getelementptr float* %arg6, i64 %25 %27 = bitcast float* %26 to <4 x float>* %wide.load3 = load <4 x float>* %27, align 16 %28 = fadd <4 x float> %wide.load3, %wide.load %29 = extractelement <4 x i64> %21, i32 0 %30 = getelementptr float* %arg4, i64 %29 %31 = bitcast float* %30 to <4 x float>* store <4 x float> %28, <4 x float>* %31, align 16 %index.next = add i64 %index, 4 %32 = icmp eq i64 %index, 0 br i1 %32, label %middle.block, label %vector.body middle.block: ; preds = %vector.body, %L5 %resume.val = phi i1 [ false, %L5 ], [ true, %vector.body ] %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] br i1 %resume.val, label %L4, label %scalar.ph scalar.ph: ; preds = %middle.block br label %L3 On 06/11/13 11:30, Arnold Schwaighofer wrote:> The loop vectorizer relies on cleanup passes to be run after it: > > from Transforms/IPO/PassManagerBuilder.cpp: > > // Add the various vectorization passes and relevant cleanup passes for > // them since we are no longer in the middle of the main scalar pipeline. > MPM.add(createLoopVectorizePass(DisableUnrollLoops)); > MPM.add(createInstructionCombiningPass()); > MPM.add(createCFGSimplificationPass()); > > > On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote: > >> The following IR implements the following nested loop: >> >> for (int i = start ; i < end ; ++i ) >> for (int p = 0 ; p < 4 ; ++p ) >> a[i*4+p] = b[i*4+p] + c[i*4+p]; >> >> >> >> >> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >> entrypoint: >> br i1 %arg2, label %L0, label %L1 >> >> L0: ; preds = %entrypoint >> %0 = add nsw i64 %arg0, %arg3 >> %1 = add nsw i64 %arg1, %arg3 >> br label %L2 >> >> L1: ; preds = %entrypoint >> br label %L2 >> >> L2: ; preds = %L0, %L1 >> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >> %4 = sdiv i64 %2, 4 >> %5 = sdiv i64 %3, 4 >> br label %L5 >> >> L3: ; preds = %L3, %L5 >> %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ] >> %7 = mul i64 %19, 4 >> %8 = add nsw i64 %7, %6 >> %9 = getelementptr float* %arg5, i64 %8 >> %10 = load float* %9 >> %11 = getelementptr float* %arg6, i64 %8 >> %12 = load float* %11 >> %13 = fadd float %12, %10 >> %14 = getelementptr float* %arg4, i64 %8 >> store float %13, float* %14 >> %15 = add nsw i64 %6, 1 >> %16 = icmp sge i64 %15, 4 >> br i1 %16, label %L4, label %L3 >> >> L4: ; preds = %L3 >> %17 = add nsw i64 %19, 1 >> %18 = icmp sge i64 %17, %5 >> br i1 %18, label %L6, label %L5 >> >> L5: ; preds = %L4, %L2 >> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >> br label %L3 >> >> L6: ; preds = %L4 >> ret void >> } >> >> >> L3 is the inner loop with constant trip count 4. >> >> When calling the loop vectorizer, >> >> opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S >> >> LV: >> Checking a loop in "main" >> LV: Found a loop: L3 >> LV: Found an induction variable. >> LV: We need to do 0 pointer comparisons. >> LV: We don't need a runtime memory check. >> LV: We can vectorize this loop! >> LV: Found trip count: 4 >> LV: The Widest type: 32 bits. >> LV: The Widest register is: 128 bits. >> .... >> LV: Vector loop of width 4 costs: 7. >> LV: Selecting VF = : 4. >> LV: Found a vectorizable loop (4) in loop4.ll >> LV: Unroll Factor is 1 >> >> we can see that the loop was vectorized. However, the code that is produced doesn't look too good: >> >> >> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >> entrypoint: >> br i1 %arg2, label %L0, label %L1 >> >> L0: ; preds = %entrypoint >> %0 = add nsw i64 %arg0, %arg3 >> %1 = add nsw i64 %arg1, %arg3 >> br label %L2 >> >> L1: ; preds = %entrypoint >> br label %L2 >> >> L2: ; preds = %L1, %L0 >> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >> %4 = sdiv i64 %2, 4 >> %5 = sdiv i64 %3, 4 >> br label %L5 >> >> L3: ; preds = %scalar.ph, %L3 >> %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ] >> %7 = mul i64 %19, 4 >> %8 = add nsw i64 %7, %6 >> %9 = getelementptr float* %arg5, i64 %8 >> %10 = load float* %9 >> %11 = getelementptr float* %arg6, i64 %8 >> %12 = load float* %11 >> %13 = fadd float %12, %10 >> %14 = getelementptr float* %arg4, i64 %8 >> store float %13, float* %14 >> %15 = add nsw i64 %6, 1 >> %16 = icmp sge i64 %15, 4 >> br i1 %16, label %L4, label %L3, !llvm.loop !0 >> >> L4: ; preds = %middle.block, %L3 >> %17 = add nsw i64 %19, 1 >> %18 = icmp sge i64 %17, %5 >> br i1 %18, label %L6, label %L5 >> >> L5: ; preds = %L4, %L2 >> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >> br i1 false, label %middle.block, label %vector.ph >> >> vector.ph: ; preds = %L5 >> %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 >> %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer >> br label %vector.body >> >> vector.body: ; preds = %vector.body, %vector.ph >> %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] >> %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 >> %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer >> %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> >> %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4> >> %21 = add nsw <4 x i64> %20, %induction >> %22 = extractelement <4 x i64> %21, i32 0 >> %23 = getelementptr float* %arg5, i64 %22 >> %24 = insertelement <4 x float*> undef, float* %23, i32 0 >> %25 = extractelement <4 x i64> %21, i32 1 >> %26 = getelementptr float* %arg5, i64 %25 >> %27 = insertelement <4 x float*> %24, float* %26, i32 1 >> %28 = extractelement <4 x i64> %21, i32 2 >> %29 = getelementptr float* %arg5, i64 %28 >> %30 = insertelement <4 x float*> %27, float* %29, i32 2 >> %31 = extractelement <4 x i64> %21, i32 3 >> %32 = getelementptr float* %arg5, i64 %31 >> %33 = insertelement <4 x float*> %30, float* %32, i32 3 >> %34 = extractelement <4 x i64> %21, i32 0 >> %35 = getelementptr float* %arg5, i64 %34 >> %36 = getelementptr float* %35, i32 0 >> %37 = bitcast float* %36 to <4 x float>* >> %wide.load = load <4 x float>* %37 >> %38 = extractelement <4 x i64> %21, i32 0 >> %39 = getelementptr float* %arg6, i64 %38 >> %40 = insertelement <4 x float*> undef, float* %39, i32 0 >> %41 = extractelement <4 x i64> %21, i32 1 >> %42 = getelementptr float* %arg6, i64 %41 >> %43 = insertelement <4 x float*> %40, float* %42, i32 1 >> %44 = extractelement <4 x i64> %21, i32 2 >> %45 = getelementptr float* %arg6, i64 %44 >> %46 = insertelement <4 x float*> %43, float* %45, i32 2 >> %47 = extractelement <4 x i64> %21, i32 3 >> %48 = getelementptr float* %arg6, i64 %47 >> %49 = insertelement <4 x float*> %46, float* %48, i32 3 >> %50 = extractelement <4 x i64> %21, i32 0 >> %51 = getelementptr float* %arg6, i64 %50 >> %52 = getelementptr float* %51, i32 0 >> %53 = bitcast float* %52 to <4 x float>* >> %wide.load3 = load <4 x float>* %53 >> %54 = fadd <4 x float> %wide.load3, %wide.load >> %55 = extractelement <4 x i64> %21, i32 0 >> %56 = getelementptr float* %arg4, i64 %55 >> %57 = insertelement <4 x float*> undef, float* %56, i32 0 >> %58 = extractelement <4 x i64> %21, i32 1 >> %59 = getelementptr float* %arg4, i64 %58 >> %60 = insertelement <4 x float*> %57, float* %59, i32 1 >> %61 = extractelement <4 x i64> %21, i32 2 >> %62 = getelementptr float* %arg4, i64 %61 >> %63 = insertelement <4 x float*> %60, float* %62, i32 2 >> %64 = extractelement <4 x i64> %21, i32 3 >> %65 = getelementptr float* %arg4, i64 %64 >> %66 = insertelement <4 x float*> %63, float* %65, i32 3 >> %67 = extractelement <4 x i64> %21, i32 0 >> %68 = getelementptr float* %arg4, i64 %67 >> %69 = getelementptr float* %68, i32 0 >> %70 = bitcast float* %69 to <4 x float>* >> store <4 x float> %54, <4 x float>* %70 >> %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1> >> %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4> >> %index.next = add i64 %index, 4 >> %73 = icmp eq i64 %index.next, 4 >> br i1 %73, label %middle.block, label %vector.body >> >> middle.block: ; preds = %vector.body, %L5 >> %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >> %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >> %cmp.n = icmp eq i64 4, %resume.val >> br i1 %cmp.n, label %L4, label %scalar.ph >> >> scalar.ph: ; preds = %middle.block >> br label %L3 >> >> L6: ; preds = %L4 >> ret void >> } >> >> >> >> Why did the loop vectorizer produced so many insertelement and extractelement instructions? >> >> I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions? >> >> Frank >> >> _______________________________________________ >> LLVM Developers mailing list >> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Arnold Schwaighofer
2013-Nov-06 16:47 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
Yes, you need the latest ToT version of llvm or you run -loop-vectorize -earlycse -instcombine -simplifycfg The bitcast essentially is a noop to satisfy the type system. This is how your example looks like for me: vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %.lhs = shl i64 %6, 2 %7 = add i64 %.lhs, %index %8 = getelementptr float* %arg5, i64 %7 %9 = bitcast float* %8 to <4 x float>* %wide.load = load <4 x float>* %9, align 16 %10 = getelementptr float* %arg6, i64 %7 %11 = bitcast float* %10 to <4 x float>* %wide.load3 = load <4 x float>* %11, align 16 %12 = fadd <4 x float> %wide.load3, %wide.load %13 = getelementptr float* %arg4, i64 %7 %14 = bitcast float* %13 to <4 x float>* store <4 x float> %12, <4 x float>* %14, align 16 %index.next = add i64 %index, 4 %15 = icmp eq i64 %index, 0 br i1 %15, label %middle.block, label %vector.body On Nov 6, 2013, at 8:39 AM, Frank Winter <fwinter at jlab.org> wrote:> The instcombine pass cleans up a lot. > > Any idea why there are still shufflevector, insertelement, *and* bitcast (!!) etc. instructions left? The original loop is so clean, a textbook example I'd say. There is no need to shuffle anything.At least I don't see it. > > Frank > > > vector.ph: ; preds = %L5 > %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 > %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer > br label %vector.body > > vector.body: ; preds = %vector.body, %vector.ph > %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] > %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 > %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer > %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> > %20 = shl <4 x i64> %broadcast.splat2, <i64 2, i64 2, i64 2, i64 2> > %21 = add nsw <4 x i64> %20, %induction > %22 = extractelement <4 x i64> %21, i32 0 > %23 = getelementptr float* %arg5, i64 %22 > %24 = bitcast float* %23 to <4 x float>* > %wide.load = load <4 x float>* %24, align 16 > %25 = extractelement <4 x i64> %21, i32 0 > %26 = getelementptr float* %arg6, i64 %25 > %27 = bitcast float* %26 to <4 x float>* > %wide.load3 = load <4 x float>* %27, align 16 > %28 = fadd <4 x float> %wide.load3, %wide.load > %29 = extractelement <4 x i64> %21, i32 0 > %30 = getelementptr float* %arg4, i64 %29 > %31 = bitcast float* %30 to <4 x float>* > store <4 x float> %28, <4 x float>* %31, align 16 > %index.next = add i64 %index, 4 > %32 = icmp eq i64 %index, 0 > br i1 %32, label %middle.block, label %vector.body > > middle.block: ; preds = %vector.body, %L5 > %resume.val = phi i1 [ false, %L5 ], [ true, %vector.body ] > %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] > br i1 %resume.val, label %L4, label %scalar.ph > > scalar.ph: ; preds = %middle.block > br label %L3 > > > > > > > On 06/11/13 11:30, Arnold Schwaighofer wrote: >> The loop vectorizer relies on cleanup passes to be run after it: >> >> from Transforms/IPO/PassManagerBuilder.cpp: >> >> // Add the various vectorization passes and relevant cleanup passes for >> // them since we are no longer in the middle of the main scalar pipeline. >> MPM.add(createLoopVectorizePass(DisableUnrollLoops)); >> MPM.add(createInstructionCombiningPass()); >> MPM.add(createCFGSimplificationPass()); >> >> >> On Nov 6, 2013, at 8:15 AM, Frank Winter <fwinter at jlab.org> wrote: >> >>> The following IR implements the following nested loop: >>> >>> for (int i = start ; i < end ; ++i ) >>> for (int p = 0 ; p < 4 ; ++p ) >>> a[i*4+p] = b[i*4+p] + c[i*4+p]; >>> >>> >>> >>> >>> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >>> entrypoint: >>> br i1 %arg2, label %L0, label %L1 >>> >>> L0: ; preds = %entrypoint >>> %0 = add nsw i64 %arg0, %arg3 >>> %1 = add nsw i64 %arg1, %arg3 >>> br label %L2 >>> >>> L1: ; preds = %entrypoint >>> br label %L2 >>> >>> L2: ; preds = %L0, %L1 >>> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >>> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >>> %4 = sdiv i64 %2, 4 >>> %5 = sdiv i64 %3, 4 >>> br label %L5 >>> >>> L3: ; preds = %L3, %L5 >>> %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ] >>> %7 = mul i64 %19, 4 >>> %8 = add nsw i64 %7, %6 >>> %9 = getelementptr float* %arg5, i64 %8 >>> %10 = load float* %9 >>> %11 = getelementptr float* %arg6, i64 %8 >>> %12 = load float* %11 >>> %13 = fadd float %12, %10 >>> %14 = getelementptr float* %arg4, i64 %8 >>> store float %13, float* %14 >>> %15 = add nsw i64 %6, 1 >>> %16 = icmp sge i64 %15, 4 >>> br i1 %16, label %L4, label %L3 >>> >>> L4: ; preds = %L3 >>> %17 = add nsw i64 %19, 1 >>> %18 = icmp sge i64 %17, %5 >>> br i1 %18, label %L6, label %L5 >>> >>> L5: ; preds = %L4, %L2 >>> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >>> br label %L3 >>> >>> L6: ; preds = %L4 >>> ret void >>> } >>> >>> >>> L3 is the inner loop with constant trip count 4. >>> >>> When calling the loop vectorizer, >>> >>> opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S >>> >>> LV: >>> Checking a loop in "main" >>> LV: Found a loop: L3 >>> LV: Found an induction variable. >>> LV: We need to do 0 pointer comparisons. >>> LV: We don't need a runtime memory check. >>> LV: We can vectorize this loop! >>> LV: Found trip count: 4 >>> LV: The Widest type: 32 bits. >>> LV: The Widest register is: 128 bits. >>> .... >>> LV: Vector loop of width 4 costs: 7. >>> LV: Selecting VF = : 4. >>> LV: Found a vectorizable loop (4) in loop4.ll >>> LV: Unroll Factor is 1 >>> >>> we can see that the loop was vectorized. However, the code that is produced doesn't look too good: >>> >>> >>> define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) { >>> entrypoint: >>> br i1 %arg2, label %L0, label %L1 >>> >>> L0: ; preds = %entrypoint >>> %0 = add nsw i64 %arg0, %arg3 >>> %1 = add nsw i64 %arg1, %arg3 >>> br label %L2 >>> >>> L1: ; preds = %entrypoint >>> br label %L2 >>> >>> L2: ; preds = %L1, %L0 >>> %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ] >>> %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ] >>> %4 = sdiv i64 %2, 4 >>> %5 = sdiv i64 %3, 4 >>> br label %L5 >>> >>> L3: ; preds = %scalar.ph, %L3 >>> %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ] >>> %7 = mul i64 %19, 4 >>> %8 = add nsw i64 %7, %6 >>> %9 = getelementptr float* %arg5, i64 %8 >>> %10 = load float* %9 >>> %11 = getelementptr float* %arg6, i64 %8 >>> %12 = load float* %11 >>> %13 = fadd float %12, %10 >>> %14 = getelementptr float* %arg4, i64 %8 >>> store float %13, float* %14 >>> %15 = add nsw i64 %6, 1 >>> %16 = icmp sge i64 %15, 4 >>> br i1 %16, label %L4, label %L3, !llvm.loop !0 >>> >>> L4: ; preds = %middle.block, %L3 >>> %17 = add nsw i64 %19, 1 >>> %18 = icmp sge i64 %17, %5 >>> br i1 %18, label %L6, label %L5 >>> >>> L5: ; preds = %L4, %L2 >>> %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ] >>> br i1 false, label %middle.block, label %vector.ph >>> >>> vector.ph: ; preds = %L5 >>> %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0 >>> %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer >>> br label %vector.body >>> >>> vector.body: ; preds = %vector.body, %vector.ph >>> %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] >>> %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 >>> %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer >>> %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> >>> %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4> >>> %21 = add nsw <4 x i64> %20, %induction >>> %22 = extractelement <4 x i64> %21, i32 0 >>> %23 = getelementptr float* %arg5, i64 %22 >>> %24 = insertelement <4 x float*> undef, float* %23, i32 0 >>> %25 = extractelement <4 x i64> %21, i32 1 >>> %26 = getelementptr float* %arg5, i64 %25 >>> %27 = insertelement <4 x float*> %24, float* %26, i32 1 >>> %28 = extractelement <4 x i64> %21, i32 2 >>> %29 = getelementptr float* %arg5, i64 %28 >>> %30 = insertelement <4 x float*> %27, float* %29, i32 2 >>> %31 = extractelement <4 x i64> %21, i32 3 >>> %32 = getelementptr float* %arg5, i64 %31 >>> %33 = insertelement <4 x float*> %30, float* %32, i32 3 >>> %34 = extractelement <4 x i64> %21, i32 0 >>> %35 = getelementptr float* %arg5, i64 %34 >>> %36 = getelementptr float* %35, i32 0 >>> %37 = bitcast float* %36 to <4 x float>* >>> %wide.load = load <4 x float>* %37 >>> %38 = extractelement <4 x i64> %21, i32 0 >>> %39 = getelementptr float* %arg6, i64 %38 >>> %40 = insertelement <4 x float*> undef, float* %39, i32 0 >>> %41 = extractelement <4 x i64> %21, i32 1 >>> %42 = getelementptr float* %arg6, i64 %41 >>> %43 = insertelement <4 x float*> %40, float* %42, i32 1 >>> %44 = extractelement <4 x i64> %21, i32 2 >>> %45 = getelementptr float* %arg6, i64 %44 >>> %46 = insertelement <4 x float*> %43, float* %45, i32 2 >>> %47 = extractelement <4 x i64> %21, i32 3 >>> %48 = getelementptr float* %arg6, i64 %47 >>> %49 = insertelement <4 x float*> %46, float* %48, i32 3 >>> %50 = extractelement <4 x i64> %21, i32 0 >>> %51 = getelementptr float* %arg6, i64 %50 >>> %52 = getelementptr float* %51, i32 0 >>> %53 = bitcast float* %52 to <4 x float>* >>> %wide.load3 = load <4 x float>* %53 >>> %54 = fadd <4 x float> %wide.load3, %wide.load >>> %55 = extractelement <4 x i64> %21, i32 0 >>> %56 = getelementptr float* %arg4, i64 %55 >>> %57 = insertelement <4 x float*> undef, float* %56, i32 0 >>> %58 = extractelement <4 x i64> %21, i32 1 >>> %59 = getelementptr float* %arg4, i64 %58 >>> %60 = insertelement <4 x float*> %57, float* %59, i32 1 >>> %61 = extractelement <4 x i64> %21, i32 2 >>> %62 = getelementptr float* %arg4, i64 %61 >>> %63 = insertelement <4 x float*> %60, float* %62, i32 2 >>> %64 = extractelement <4 x i64> %21, i32 3 >>> %65 = getelementptr float* %arg4, i64 %64 >>> %66 = insertelement <4 x float*> %63, float* %65, i32 3 >>> %67 = extractelement <4 x i64> %21, i32 0 >>> %68 = getelementptr float* %arg4, i64 %67 >>> %69 = getelementptr float* %68, i32 0 >>> %70 = bitcast float* %69 to <4 x float>* >>> store <4 x float> %54, <4 x float>* %70 >>> %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1> >>> %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4> >>> %index.next = add i64 %index, 4 >>> %73 = icmp eq i64 %index.next, 4 >>> br i1 %73, label %middle.block, label %vector.body >>> >>> middle.block: ; preds = %vector.body, %L5 >>> %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >>> %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ] >>> %cmp.n = icmp eq i64 4, %resume.val >>> br i1 %cmp.n, label %L4, label %scalar.ph >>> >>> scalar.ph: ; preds = %middle.block >>> br label %L3 >>> >>> L6: ; preds = %L4 >>> ret void >>> } >>> >>> >>> >>> Why did the loop vectorizer produced so many insertelement and extractelement instructions? >>> >>> I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions? >>> >>> Frank >>> >>> _______________________________________________ >>> LLVM Developers mailing list >>> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev > >
Renato Golin
2013-Nov-06 19:02 UTC
[LLVMdev] loop vectorizer: Unexpected extract/insertelement
On 6 November 2013 08:47, Arnold Schwaighofer <aschwaighofer at apple.com>wrote:> The bitcast essentially is a noop to satisfy the type system.Even if the subsequent passes don't clean up everything, the back-ends will understand those patterns as specific vector instructions. Actually, if the casts and shuffles are not there, you'll end up with the wrong instruction being selected (or just broken IR). cheers, --renato -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20131106/10794f25/attachment.html>
Apparently Analagous Threads
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: Unexpected extract/insertelement
- [LLVMdev] loop vectorizer: JIT + AVX segfaults
- [LLVMdev] loop vectorizer: this loop is not worth vectorizing