Denis Antrushin via llvm-dev
2019-Oct-24 17:27 UTC
[llvm-dev] LoopVectorizer and nowrap flags
Hi, I ran into a problem which I think is caused by the loop vectorizer incorrectly copying nowrap flags from scalar instructions to vector ones. Consider this testcase: ================================================================== ; RUN: opt --loop-vectorize --loop-unroll %s define void @test(i32* %B) { entry: br label %outer_loop outer_loop: %local_4 = phi i32 [ 2, %entry ], [ %4, %outer_tail] br label %inner_loop inner_loop: %local_2 = phi i32 [ 0, %outer_loop ], [ %1, %inner_loop ] %local_3 = phi i32 [ -104, %outer_loop ], [ %0, %inner_loop ] ; {-104, -, %local_4} %0 = sub nuw nsw i32 %local_3, %local_4 ; nuw is correct here %1 = add nuw nsw i32 %local_2, 1 %2 = icmp ugt i32 %local_2, 126 br i1 %2, label %outer_tail, label %inner_loop outer_tail: %3 = phi i32 [ %0, %inner_loop ] store atomic i32 %3, i32 * %B unordered, align 8 %4 = add i32 %local_4, 1 %5 = icmp slt i32 %4, 6 br i1 %5, label %outer_loop, label %exit exit: ret void } ================================================================== Note the nuw/nsw flags set on the '%0 = sub ...' instruction. They look valid. 
After vectorization I have: ================================================================== vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ <i32 -104, i32 0, i32 0, i32 0>, %vector.ph ], [ %2, %vector.body ] %vec.phi2 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> %induction1 = add <4 x i32> %broadcast.splat, <i32 4, i32 5, i32 6, i32 7> %0 = add i32 %index, 0 %1 = add i32 %index, 4 %2 = sub nuw nsw <4 x i32> %vec.phi, %broadcast.splat4 %3 = sub nuw nsw <4 x i32> %vec.phi2, %broadcast.splat6 // nuw present but does not seem valid %index.next = add i32 %index, 8 %4 = icmp eq i32 %index.next, 128 br i1 %4, label %middle.block, label %vector.body, !llvm.loop !0 ================================================================== Note that '%3 = sub ...' 
still has nuw set, but it looks wrong, because it starts from 0 and subtracts a positive value. And when loop unrolling runs after the vectorizer, it thinks that (0 - x)<nuw> is a no-op and removes it, leaving only the first half: ================================================================== vector.body: ; preds = %vector.ph %0 = sub nuw nsw <4 x i32> <i32 -104, i32 0, i32 0, i32 0>, <i32 2, i32 2, i32 2, i32 2> %1 = sub nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2> %2 = sub nuw nsw <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2> %3 = sub nuw nsw <4 x i32> %2, <i32 2, i32 2, i32 2, i32 2> %4 = sub nuw nsw <4 x i32> %3, <i32 2, i32 2, i32 2, i32 2> %5 = sub nuw nsw <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2> %6 = sub nuw nsw <4 x i32> %5, <i32 2, i32 2, i32 2, i32 2> %7 = sub nuw nsw <4 x i32> %6, <i32 2, i32 2, i32 2, i32 2> %8 = sub nuw nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2> %9 = sub nuw nsw <4 x i32> %8, <i32 2, i32 2, i32 2, i32 2> %10 = sub nuw nsw <4 x i32> %9, <i32 2, i32 2, i32 2, i32 2> %11 = sub nuw nsw <4 x i32> %10, <i32 2, i32 2, i32 2, i32 2> %12 = sub nuw nsw <4 x i32> %11, <i32 2, i32 2, i32 2, i32 2> %13 = sub nuw nsw <4 x i32> %12, <i32 2, i32 2, i32 2, i32 2> %14 = sub nuw nsw <4 x i32> %13, <i32 2, i32 2, i32 2, i32 2> %15 = sub nuw nsw <4 x i32> %14, <i32 2, i32 2, i32 2, i32 2> %rdx.shuf = shufflevector <4 x i32> %15, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %bin.rdx7 = add <4 x i32> %15, %rdx.shuf %rdx.shuf8 = shufflevector <4 x i32> %bin.rdx7, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> %bin.rdx9 = add <4 x i32> %bin.rdx7, %rdx.shuf8 %16 = extractelement <4 x i32> %bin.rdx9, i32 0 br i1 true, label %outer_tail, label %scalar.ph ================================================================== What's the proper way to handle this problem? For now I disabled nowrap flag propagation in InnerLoopVectorizer::widenInstruction, but it does not look like a correct fix. Thanks, Denis