Josh Klontz
2014-Dec-26 14:27 UTC
[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to use `llvm.assume` to communicate pointer alignment guarantees to vector load and store instructions. For example, in [2] %5 and %9 are guaranteed to be 32-byte aligned. However, if I run this IR through `opt -O3 -datalayout -S`, the vectorized loads and stores are still 1-byte aligned [3]. What's going wrong? Do I have to move the `llvm.assume` into the loop body? v/r, Josh [1] http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf [2] ; ModuleID = 'align.ll' %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } ; Function Attrs: noduplicate nounwind readonly declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 ; Function Attrs: nounwind declare void @llvm.assume(i1) #1 ; Function Attrs: nounwind define %u8XY* @benchmark(%u8XY*) #1 { entry: %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 %columns = load i32* %1, align 4, !range !0 %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 %rows = load i32* %2, align 4, !range !0 %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 %rows, i32 1, i8* null) %4 = zext i32 %rows to i64 %dst_y_step = zext i32 %columns to i64 %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 %6 = ptrtoint i8* %5 to i64 %7 = and i64 %6, 31 %8 = icmp eq i64 %7, 0 tail call void @llvm.assume(i1 %8) %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 %10 = ptrtoint i8* %9 to i64 %11 = and i64 %10, 31 %12 = icmp eq i64 %11, 0 tail call void @llvm.assume(i1 %12) %13 = mul nuw nsw i64 %4, %dst_y_step br label %x_body x_body: ; preds = %x_body, %entry %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ] %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1 %.lobit = lshr i8 %15, 7 %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y store i8 %.lobit, i8* %16, align 1, 
!llvm.mem.parallel_loop_access !1 %y_increment = add nuw nsw i64 %y, 1 %y_postcondition = icmp eq i64 %y_increment, %13 br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2 y_exit: ; preds = %x_body ret %u8XY* %3 } attributes #0 = { noduplicate nounwind readonly } attributes #1 = { nounwind } !0 = !{i32 1, i32 -1} !1 = !{!1} !2 = !{!2} [3] ; ModuleID = 'align.ll' %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } ; Function Attrs: noduplicate nounwind readonly declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 ; Function Attrs: nounwind declare void @llvm.assume(i1) #1 ; Function Attrs: nounwind define %u8XY* @benchmark(%u8XY*) #1 { entry: %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 %columns = load i32* %1, align 4, !range !0 %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 %rows = load i32* %2, align 4, !range !0 %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 %rows, i32 1, i8* null) %4 = zext i32 %rows to i64 %dst_y_step = zext i32 %columns to i64 %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 %6 = ptrtoint i8* %5 to i64 %7 = and i64 %6, 31 %8 = icmp eq i64 %7, 0 tail call void @llvm.assume(i1 %8) %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 %10 = ptrtoint i8* %9 to i64 %11 = and i64 %10, 31 %12 = icmp eq i64 %11, 0 tail call void @llvm.assume(i1 %12) %13 = mul nuw nsw i64 %4, %dst_y_step %14 = zext i32 %rows to i64 %15 = zext i32 %columns to i64 %16 = mul nuw i64 %14, %15 %n.vec = and i64 %16, -4 %cmp.zero = icmp eq i64 %n.vec, 0 br i1 %cmp.zero, label %middle.block, label %vector.body.preheader vector.body.preheader: ; preds = %entry br label %vector.body vector.body: ; preds %vector.body.preheader, %vector.body %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index %18 = bitcast i8* %17 to <4 x i8>* %wide.load = load <4 x i8>* %18, 
align 1 %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7> %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index %21 = bitcast i8* %20 to <4 x i8>* store <4 x i8> %19, <4 x i8>* %21, align 1 %index.next = add i64 %index, 4 %22 = icmp eq i64 %index.next, %n.vec br i1 %22, label %middle.block.loopexit, label %vector.body, !llvm.loop !1 middle.block.loopexit: ; preds = %vector.body br label %middle.block middle.block: ; preds %middle.block.loopexit, %entry %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit ] %cmp.n = icmp eq i64 %16, %resume.val br i1 %cmp.n, label %y_exit, label %x_body.preheader x_body.preheader: ; preds = %middle.block br label %x_body x_body: ; preds %x_body.preheader, %x_body %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, %x_body.preheader ] %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4 %.lobit = lshr i8 %24, 7 %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4 %y_increment = add nuw nsw i64 %y, 1 %y_postcondition = icmp eq i64 %y_increment, %13 br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, !llvm.loop !5 y_exit.loopexit: ; preds = %x_body br label %y_exit y_exit: ; preds %y_exit.loopexit, %middle.block ret %u8XY* %3 } attributes #0 = { noduplicate nounwind readonly } attributes #1 = { nounwind } !0 = !{i32 1, i32 -1} !1 = !{!1, !2, !3} !2 = !{!"llvm.loop.vectorize.width", i32 1} !3 = !{!"llvm.loop.interleave.count", i32 1} !4 = !{!4} !5 = !{!5, !2, !3} -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141226/645710fa/attachment.html>
Hal Finkel
2014-Dec-28 22:21 UTC
[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
----- Original Message ----- > From: "Josh Klontz" <josh.klontz at gmail.com> > To: "Dev" <llvmdev at cs.uiuc.edu> > Sent: Friday, December 26, 2014 8:27:43 AM > Subject: [LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment? > > Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to > use `llvm.assume` to communicate pointer alignment guarantees to > vector load and store instructions. For example, in [2] %5 and %9 > are guaranteed to be 32-byte aligned. However, if I run this IR > through `opt -O3 -datalayout -S`, the vectorized loads and stores > are still 1-byte aligned [3]. What's going wrong? Do I have to move > the `llvm.assume` into the loop body? Hi John, The problem is that you're asserting an alignment fact about: %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 and you want this to apply to pointers derived from this value within the loop: %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index I'm pretty sure we currently only look 'up' the use/def chain for alignment facts, and so nothing triggers because %17 is derived from %0, and there is no alignment fact asserted directly on %0. Can you please file a bug report about this (at http://llvm.org/bugs/)? I think that we can likely fix this. 
-Hal> > > v/r, > Josh > > > > > > [1] > http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf > > > [2] > ; ModuleID = 'align.ll' > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > ; Function Attrs: noduplicate nounwind readonly > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > ; Function Attrs: nounwind > declare void @llvm.assume(i1) #1 > > > ; Function Attrs: nounwind > define %u8XY* @benchmark(%u8XY*) #1 { > entry: > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > %columns = load i32* %1, align 4, !range !0 > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > %rows = load i32* %2, align 4, !range !0 > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 > %rows, i32 1, i8* null) > %4 = zext i32 %rows to i64 > %dst_y_step = zext i32 %columns to i64 > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > %6 = ptrtoint i8* %5 to i64 > %7 = and i64 %6, 31 > %8 = icmp eq i64 %7, 0 > tail call void @llvm.assume(i1 %8) > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > %10 = ptrtoint i8* %9 to i64 > %11 = and i64 %10, 31 > %12 = icmp eq i64 %11, 0 > tail call void @llvm.assume(i1 %12) > %13 = mul nuw nsw i64 %4, %dst_y_step > br label %x_body > > > x_body: ; preds = %x_body, %entry > %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ] > %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1 > %.lobit = lshr i8 %15, 7 > %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1 > %y_increment = add nuw nsw i64 %y, 1 > %y_postcondition = icmp eq i64 %y_increment, %13 > br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2 > > > y_exit: ; preds = %x_body > ret %u8XY* %3 > } > > > attributes #0 = { noduplicate nounwind readonly } > attributes #1 = { nounwind } > 
> > !0 = !{i32 1, i32 -1} > !1 = !{!1} > !2 = !{!2} > > > [3] > ; ModuleID = 'align.ll' > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > ; Function Attrs: noduplicate nounwind readonly > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > ; Function Attrs: nounwind > declare void @llvm.assume(i1) #1 > > > ; Function Attrs: nounwind > define %u8XY* @benchmark(%u8XY*) #1 { > entry: > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > %columns = load i32* %1, align 4, !range !0 > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > %rows = load i32* %2, align 4, !range !0 > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 > %rows, i32 1, i8* null) > %4 = zext i32 %rows to i64 > %dst_y_step = zext i32 %columns to i64 > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > %6 = ptrtoint i8* %5 to i64 > %7 = and i64 %6, 31 > %8 = icmp eq i64 %7, 0 > tail call void @llvm.assume(i1 %8) > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > %10 = ptrtoint i8* %9 to i64 > %11 = and i64 %10, 31 > %12 = icmp eq i64 %11, 0 > tail call void @llvm.assume(i1 %12) > %13 = mul nuw nsw i64 %4, %dst_y_step > %14 = zext i32 %rows to i64 > %15 = zext i32 %columns to i64 > %16 = mul nuw i64 %14, %15 > %n.vec = and i64 %16, -4 > %cmp.zero = icmp eq i64 %n.vec, 0 > br i1 %cmp.zero, label %middle.block, label %vector.body.preheader > > > vector.body.preheader: ; preds = %entry > br label %vector.body > > > vector.body: ; preds = %vector.body.preheader, %vector.body > %index = phi i64 [ %index.next, %vector.body ], [ 0, > %vector.body.preheader ] > %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index > %18 = bitcast i8* %17 to <4 x i8>* > %wide.load = load <4 x i8>* %18, align 1 > %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7> > %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index > %21 = bitcast i8* %20 to <4 x i8>* > store <4 x i8> 
%19, <4 x i8>* %21, align 1 > %index.next = add i64 %index, 4 > %22 = icmp eq i64 %index.next, %n.vec > br i1 %22, label %middle.block.loopexit, label %vector.body, > !llvm.loop !1 > > > middle.block.loopexit: ; preds = %vector.body > br label %middle.block > > > middle.block: ; preds = %middle.block.loopexit, %entry > %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit > ] > %cmp.n = icmp eq i64 %16, %resume.val > br i1 %cmp.n, label %y_exit, label %x_body.preheader > > > x_body.preheader: ; preds = %middle.block > br label %x_body > > > x_body: ; preds = %x_body.preheader, %x_body > %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, > %x_body.preheader ] > %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4 > %.lobit = lshr i8 %24, 7 > %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4 > %y_increment = add nuw nsw i64 %y, 1 > %y_postcondition = icmp eq i64 %y_increment, %13 > br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, > !llvm.loop !5 > > > y_exit.loopexit: ; preds = %x_body > br label %y_exit > > > y_exit: ; preds = %y_exit.loopexit, %middle.block > ret %u8XY* %3 > } > > > attributes #0 = { noduplicate nounwind readonly } > attributes #1 = { nounwind } > > > !0 = !{i32 1, i32 -1} > !1 = !{!1, !2, !3} > !2 = !{!"llvm.loop.vectorize.width", i32 1} > !3 = !{!"llvm.loop.interleave.count", i32 1} > !4 = !{!4} > !5 = !{!5, !2, !3} > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev >-- Hal Finkel Assistant Computational Scientist Leadership Computing Facility Argonne National Laboratory
Hal Finkel
2014-Dec-28 22:23 UTC
[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
----- Original Message -----> From: "Hal Finkel" <hfinkel at anl.gov> > To: "Josh Klontz" <josh.klontz at gmail.com> > Cc: "Dev" <llvmdev at cs.uiuc.edu> > Sent: Sunday, December 28, 2014 4:21:51 PM > Subject: Re: [LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment? > > ----- Original Message ----- > > From: "Josh Klontz" <josh.klontz at gmail.com> > > To: "Dev" <llvmdev at cs.uiuc.edu> > > Sent: Friday, December 26, 2014 8:27:43 AM > > Subject: [LLVMdev] Correct usage of `llvm.assume` for loop > > vectorization alignment? > > > > Using LLVM ToT and Hal's helpful slide deck [1], I've been trying > > to > > use `llvm.assume` to communicate pointer alignment guarantees to > > vector load and store instructions. For example, in [2] %5 and %9 > > are guaranteed to be 32-byte aligned. However, if I run this IR > > through `opt -O3 -datalayout -S`, the vectorized loads and stores > > are still 1-byte aligned [3]. What's going wrong? Do I have to move > > the `llvm.assume` into the loop body? > > Hi John,[Err, Josh. Sorry about that]. -Hal> > The problem is that you're asserting an alignment fact about: > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > and you want this to apply to pointers derived from this value within > the loop: > %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index > > I'm pretty sure we currently only look 'up' the use/def chain for > alignment facts, and so nothing triggers because %17 is derived from > %0, and there is no alignment fact asserted directly on %0. > > Can you please file a bug report about this (at > http://llvm.org/bugs/)? I think that we can likely fix this. 
> > -Hal > > > > > > > v/r, > > Josh > > > > > > > > > > > > [1] > > http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf > > > > > > [2] > > ; ModuleID = 'align.ll' > > > > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > > > > ; Function Attrs: noduplicate nounwind readonly > > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > > > > ; Function Attrs: nounwind > > declare void @llvm.assume(i1) #1 > > > > > > ; Function Attrs: nounwind > > define %u8XY* @benchmark(%u8XY*) #1 { > > entry: > > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > > %columns = load i32* %1, align 4, !range !0 > > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > > %rows = load i32* %2, align 4, !range !0 > > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, > > i32 > > %rows, i32 1, i8* null) > > %4 = zext i32 %rows to i64 > > %dst_y_step = zext i32 %columns to i64 > > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > > %6 = ptrtoint i8* %5 to i64 > > %7 = and i64 %6, 31 > > %8 = icmp eq i64 %7, 0 > > tail call void @llvm.assume(i1 %8) > > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > %10 = ptrtoint i8* %9 to i64 > > %11 = and i64 %10, 31 > > %12 = icmp eq i64 %11, 0 > > tail call void @llvm.assume(i1 %12) > > %13 = mul nuw nsw i64 %4, %dst_y_step > > br label %x_body > > > > > > x_body: ; preds = %x_body, %entry > > %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ] > > %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > > %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1 > > %.lobit = lshr i8 %15, 7 > > %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > > store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access > > !1 > > %y_increment = add nuw nsw i64 %y, 1 > > %y_postcondition = icmp eq i64 %y_increment, %13 > > br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop 
!2 > > > > > > y_exit: ; preds = %x_body > > ret %u8XY* %3 > > } > > > > > > attributes #0 = { noduplicate nounwind readonly } > > attributes #1 = { nounwind } > > > > > > !0 = !{i32 1, i32 -1} > > !1 = !{!1} > > !2 = !{!2} > > > > > > [3] > > ; ModuleID = 'align.ll' > > > > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > > > > ; Function Attrs: noduplicate nounwind readonly > > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > > > > ; Function Attrs: nounwind > > declare void @llvm.assume(i1) #1 > > > > > > ; Function Attrs: nounwind > > define %u8XY* @benchmark(%u8XY*) #1 { > > entry: > > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > > %columns = load i32* %1, align 4, !range !0 > > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > > %rows = load i32* %2, align 4, !range !0 > > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, > > i32 > > %rows, i32 1, i8* null) > > %4 = zext i32 %rows to i64 > > %dst_y_step = zext i32 %columns to i64 > > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > > %6 = ptrtoint i8* %5 to i64 > > %7 = and i64 %6, 31 > > %8 = icmp eq i64 %7, 0 > > tail call void @llvm.assume(i1 %8) > > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > %10 = ptrtoint i8* %9 to i64 > > %11 = and i64 %10, 31 > > %12 = icmp eq i64 %11, 0 > > tail call void @llvm.assume(i1 %12) > > %13 = mul nuw nsw i64 %4, %dst_y_step > > %14 = zext i32 %rows to i64 > > %15 = zext i32 %columns to i64 > > %16 = mul nuw i64 %14, %15 > > %n.vec = and i64 %16, -4 > > %cmp.zero = icmp eq i64 %n.vec, 0 > > br i1 %cmp.zero, label %middle.block, label %vector.body.preheader > > > > > > vector.body.preheader: ; preds = %entry > > br label %vector.body > > > > > > vector.body: ; preds = %vector.body.preheader, %vector.body > > %index = phi i64 [ %index.next, %vector.body ], [ 0, > > %vector.body.preheader ] > > %17 = getelementptr 
%u8XY* %0, i64 0, i32 6, i64 %index > > %18 = bitcast i8* %17 to <4 x i8>* > > %wide.load = load <4 x i8>* %18, align 1 > > %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7> > > %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index > > %21 = bitcast i8* %20 to <4 x i8>* > > store <4 x i8> %19, <4 x i8>* %21, align 1 > > %index.next = add i64 %index, 4 > > %22 = icmp eq i64 %index.next, %n.vec > > br i1 %22, label %middle.block.loopexit, label %vector.body, > > !llvm.loop !1 > > > > > > middle.block.loopexit: ; preds = %vector.body > > br label %middle.block > > > > > > middle.block: ; preds = %middle.block.loopexit, %entry > > %resume.val = phi i64 [ 0, %entry ], [ %n.vec, > > %middle.block.loopexit > > ] > > %cmp.n = icmp eq i64 %16, %resume.val > > br i1 %cmp.n, label %y_exit, label %x_body.preheader > > > > > > x_body.preheader: ; preds = %middle.block > > br label %x_body > > > > > > x_body: ; preds = %x_body.preheader, %x_body > > %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, > > %x_body.preheader ] > > %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > > %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4 > > %.lobit = lshr i8 %24, 7 > > %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > > store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access > > !4 > > %y_increment = add nuw nsw i64 %y, 1 > > %y_postcondition = icmp eq i64 %y_increment, %13 > > br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, > > !llvm.loop !5 > > > > > > y_exit.loopexit: ; preds = %x_body > > br label %y_exit > > > > > > y_exit: ; preds = %y_exit.loopexit, %middle.block > > ret %u8XY* %3 > > } > > > > > > attributes #0 = { noduplicate nounwind readonly } > > attributes #1 = { nounwind } > > > > > > !0 = !{i32 1, i32 -1} > > !1 = !{!1, !2, !3} > > !2 = !{!"llvm.loop.vectorize.width", i32 1} > > !3 = !{!"llvm.loop.interleave.count", i32 1} > > !4 = !{!4} > > !5 = !{!5, !2, !3} > > _______________________________________________ > > 
LLVM Developers mailing list > > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev > > > > -- > Hal Finkel > Assistant Computational Scientist > Leadership Computing Facility > Argonne National Laboratory >-- Hal Finkel Assistant Computational Scientist Leadership Computing Facility Argonne National Laboratory
Josh Klontz
2014-Dec-29 15:14 UTC
[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
Makes sense, thanks for the quick response. I've filed a bug report: http://llvm.org/bugs/show_bug.cgi?id=22049 In general, it appears that the issue arises when articulating the alignment of an array in a struct. In this case, loads and stores from the array don't share a use/def chain with the alignment fact. I believe this is a result of a prior optimization pass canonicalizing 1) a struct GEP to the start of the array, followed by 2) a GEP indexing into the array, into a single 1+2) struct GEP with an offset. (I didn't share my original un-optimized IR with an intact use-def chain as I didn't realize the underlying issue until now). Alternatively, if there was a way to add alignment metadata to an array in a struct type declaration, that would offer a more direct solution for my specific use case. To my knowledge, such an annotation is not supported. v/r, Josh On Sun, Dec 28, 2014 at 5:21 PM, Hal Finkel <hfinkel at anl.gov> wrote:> ----- Original Message ----- > > From: "Josh Klontz" <josh.klontz at gmail.com> > > To: "Dev" <llvmdev at cs.uiuc.edu> > > Sent: Friday, December 26, 2014 8:27:43 AM > > Subject: [LLVMdev] Correct usage of `llvm.assume` for loop > vectorization alignment? > > > > Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to > > use `llvm.assume` to communicate pointer alignment guarantees to > > vector load and store instructions. For example, in [2] %5 and %9 > > are guaranteed to be 32-byte aligned. However, if I run this IR > > through `opt -O3 -datalayout -S`, the vectorized loads and stores > > are still 1-byte aligned [3]. What's going wrong? Do I have to move > > the `llvm.assume` into the loop body? 
> > Hi John, > > The problem is that you're asserting an alignment fact about: > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > and you want this to apply to pointers derived from this value within the > loop: > %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index > > I'm pretty sure we currently only look 'up' the use/def chain for > alignment facts, and so nothing triggers because %17 is derived from %0, > and there is no alignment fact asserted directly on %0. > > Can you please file a bug report about this (at http://llvm.org/bugs/)? I > think that we can likely fix this. > > -Hal > > > > > > > v/r, > > Josh > > > > > > > > > > > > [1] > > > http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf > > > > > > [2] > > ; ModuleID = 'align.ll' > > > > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > > > > ; Function Attrs: noduplicate nounwind readonly > > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > > > > ; Function Attrs: nounwind > > declare void @llvm.assume(i1) #1 > > > > > > ; Function Attrs: nounwind > > define %u8XY* @benchmark(%u8XY*) #1 { > > entry: > > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > > %columns = load i32* %1, align 4, !range !0 > > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > > %rows = load i32* %2, align 4, !range !0 > > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 > > %rows, i32 1, i8* null) > > %4 = zext i32 %rows to i64 > > %dst_y_step = zext i32 %columns to i64 > > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > > %6 = ptrtoint i8* %5 to i64 > > %7 = and i64 %6, 31 > > %8 = icmp eq i64 %7, 0 > > tail call void @llvm.assume(i1 %8) > > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > %10 = ptrtoint i8* %9 to i64 > > %11 = and i64 %10, 31 > > %12 = icmp eq i64 %11, 0 > > tail call void @llvm.assume(i1 %12) > > %13 = mul nuw 
nsw i64 %4, %dst_y_step > > br label %x_body > > > > > > x_body: ; preds = %x_body, %entry > > %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ] > > %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > > %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1 > > %.lobit = lshr i8 %15, 7 > > %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > > store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1 > > %y_increment = add nuw nsw i64 %y, 1 > > %y_postcondition = icmp eq i64 %y_increment, %13 > > br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2 > > > > > > y_exit: ; preds = %x_body > > ret %u8XY* %3 > > } > > > > > > attributes #0 = { noduplicate nounwind readonly } > > attributes #1 = { nounwind } > > > > > > !0 = !{i32 1, i32 -1} > > !1 = !{!1} > > !2 = !{!2} > > > > > > [3] > > ; ModuleID = 'align.ll' > > > > > > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] } > > > > > > ; Function Attrs: noduplicate nounwind readonly > > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 > > zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0 > > > > > > ; Function Attrs: nounwind > > declare void @llvm.assume(i1) #1 > > > > > > ; Function Attrs: nounwind > > define %u8XY* @benchmark(%u8XY*) #1 { > > entry: > > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3 > > %columns = load i32* %1, align 4, !range !0 > > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4 > > %rows = load i32* %2, align 4, !range !0 > > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 > > %rows, i32 1, i8* null) > > %4 = zext i32 %rows to i64 > > %dst_y_step = zext i32 %columns to i64 > > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0 > > %6 = ptrtoint i8* %5 to i64 > > %7 = and i64 %6, 31 > > %8 = icmp eq i64 %7, 0 > > tail call void @llvm.assume(i1 %8) > > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0 > > %10 = ptrtoint i8* %9 to i64 > > %11 = and i64 %10, 31 > > %12 = icmp eq 
i64 %11, 0 > > tail call void @llvm.assume(i1 %12) > > %13 = mul nuw nsw i64 %4, %dst_y_step > > %14 = zext i32 %rows to i64 > > %15 = zext i32 %columns to i64 > > %16 = mul nuw i64 %14, %15 > > %n.vec = and i64 %16, -4 > > %cmp.zero = icmp eq i64 %n.vec, 0 > > br i1 %cmp.zero, label %middle.block, label %vector.body.preheader > > > > > > vector.body.preheader: ; preds = %entry > > br label %vector.body > > > > > > vector.body: ; preds = %vector.body.preheader, %vector.body > > %index = phi i64 [ %index.next, %vector.body ], [ 0, > > %vector.body.preheader ] > > %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index > > %18 = bitcast i8* %17 to <4 x i8>* > > %wide.load = load <4 x i8>* %18, align 1 > > %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7> > > %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index > > %21 = bitcast i8* %20 to <4 x i8>* > > store <4 x i8> %19, <4 x i8>* %21, align 1 > > %index.next = add i64 %index, 4 > > %22 = icmp eq i64 %index.next, %n.vec > > br i1 %22, label %middle.block.loopexit, label %vector.body, > > !llvm.loop !1 > > > > > > middle.block.loopexit: ; preds = %vector.body > > br label %middle.block > > > > > > middle.block: ; preds = %middle.block.loopexit, %entry > > %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit > > ] > > %cmp.n = icmp eq i64 %16, %resume.val > > br i1 %cmp.n, label %y_exit, label %x_body.preheader > > > > > > x_body.preheader: ; preds = %middle.block > > br label %x_body > > > > > > x_body: ; preds = %x_body.preheader, %x_body > > %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, > > %x_body.preheader ] > > %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y > > %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4 > > %.lobit = lshr i8 %24, 7 > > %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y > > store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4 > > %y_increment = add nuw nsw i64 %y, 1 > > %y_postcondition = icmp eq i64 %y_increment, %13 
> > br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, > > !llvm.loop !5 > > > > > > y_exit.loopexit: ; preds = %x_body > > br label %y_exit > > > > > > y_exit: ; preds = %y_exit.loopexit, %middle.block > > ret %u8XY* %3 > > } > > > > > > attributes #0 = { noduplicate nounwind readonly } > > attributes #1 = { nounwind } > > > > > > !0 = !{i32 1, i32 -1} > > !1 = !{!1, !2, !3} > > !2 = !{!"llvm.loop.vectorize.width", i32 1} > > !3 = !{!"llvm.loop.interleave.count", i32 1} > > !4 = !{!4} > > !5 = !{!5, !2, !3} > > _______________________________________________ > > LLVM Developers mailing list > > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev > > > > -- > Hal Finkel > Assistant Computational Scientist > Leadership Computing Facility > Argonne National Laboratory >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141229/7df35e92/attachment.html>
Apparently Analogous Threads
- [LLVMdev] [LV] possible `vector.memcheck` regression when using `llvm.loop` and `llvm.mem.parallel_loop_access`
- [LLVMdev] Will any pass change simple return branch into select/return pair?
- Issues with new Attributor (replaceAllUses fails with type mismatch)
- [LLVMdev] [cfe-commits] [PATCH/RFC, PowerPC] Extend 32-bit function arguments / return values
- Issues with new Attributor (replaceAllUses fails with type mismatch)