Neil Henning via llvm-dev
2020-Jul-16 13:25 UTC
[llvm-dev] LLVM 11 and trunk selecting 4 wide instead of 8 wide loop vectorization for AVX-enabled target
Hey list, I've recently done the first test run of bumping our Burst compiler from LLVM 10 -> 11 now that the branch has been cut, and have noticed an apparent loop vectorization codegen regression for X86 with AVX or AVX2 enabled. The following IR example is vectorized to 4 wide with LLVM 11 and trunk whereas in LLVM 10 it (correctly as per what we want) vectorized it 8 wide matching the ymm registers. ; ModuleID = '../test.ll' source_filename = "main" target datalayout "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc-coff" %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0" = type { float*, i32, [4 x i8] } ; Function Attrs: nofree define dllexport void @func(float* noalias nocapture %output, %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull readonly dereferenceable(16) %a, %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull readonly dereferenceable(16) %b) local_unnamed_addr #0 !ubaa. 
!1 { entry: %0 = getelementptr %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0", %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a, i64 0, i32 1 %1 = load i32, i32* %0, align 1 %.not = icmp eq i32 %1, 0 br i1 %.not, label %BL.0042, label %BL.0005.lr.ph BL.0005.lr.ph: ; preds = %entry %2 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a to i8** %3 = load i8*, i8** %2, align 1 %4 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %b to i8** %5 = load i8*, i8** %4, align 1 %wide.trip.count = zext i32 %1 to i64 br label %BL.0005 BL.0005: ; preds = %BL.0005, % BL.0005.lr.ph %indvars.iv = phi i64 [ 0, %BL.0005.lr.ph ], [ %indvars.iv.next, %BL.0005 ] %6 = shl nuw nsw i64 %indvars.iv, 2 %7 = getelementptr float, float* %output, i64 %indvars.iv %8 = getelementptr i8, i8* %3, i64 %6 %9 = bitcast i8* %8 to float* %10 = load float, float* %9, align 4 %11 = getelementptr i8, i8* %5, i64 %6 %12 = bitcast i8* %11 to float* %13 = load float, float* %12, align 4 %14 = tail call float @llvm.pow.f32(float %10, float %13) store float %14, float* %7, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %BL.0042, label %BL.0005 BL.0042: ; preds = %BL.0005, %entry ret void } ; Function Attrs: norecurse readnone define dllexport void @burst.initialize(i8* (i8*)* nocapture readnone %callback) local_unnamed_addr #1 !ubaa. 
!0 { entry: ret void } ; Function Attrs: nounwind readnone speculatable willreturn declare float @llvm.pow.f32(float, float) #2 attributes #0 = { nofree } attributes #1 = { norecurse readnone } attributes #2 = { nounwind readnone speculatable willreturn } !ubaa.Burst.Compiler.IL.Tests.VectorsMaths\2FFloatPointer.0 = !{!0, !0, !0, !0} !0 = !{i1 false} !1 = !{i1 true, i1 false, i1 false} If I run this with ../llvm-project/llvm/build/bin/opt.exe -o - -S -O3 ../avx_sad_4.ll -mattr=avx -debug, I can see that the loop vectorizer correctly considers using 8-wide ymm registers for this, but has decided that the 4-wide variant is cheaper based on some cost modelling I don't understand. So is this expected behaviour? I know there was some cost model changes in the 10->11 timeframe. Thanks for any help, Cheers, -Neil. -- Neil Henning Senior Software Engineer Compiler unity.com -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200716/337f6f39/attachment.html>
Roman Lebedev via llvm-dev
2020-Jul-16 13:38 UTC
[llvm-dev] LLVM 11 and trunk selecting 4 wide instead of 8 wide loop vectorization for AVX-enabled target
Did you specify the target CPU the code should be optimized for? For clang that is -march=native/znver2/... / -mtune=<same> For opt/llc that is --mcpu=<same> I would expect that by default, some generic baseline is picked. On Thu, Jul 16, 2020 at 4:25 PM Neil Henning via llvm-dev < llvm-dev at lists.llvm.org> wrote:> Hey list, > > I've recently done the first test run of bumping our Burst compiler from > LLVM 10 -> 11 now that the branch has been cut, and have noticed an > apparent loop vectorization codegen regression for X86 with AVX or AVX2 > enabled. The following IR example is vectorized to 4 wide with LLVM 11 and > trunk whereas in LLVM 10 it (correctly as per what we want) vectorized it 8 > wide matching the ymm registers. > > ; ModuleID = '../test.ll' > source_filename = "main" > target datalayout > "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-pc-windows-msvc-coff" > > %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0" = type { float*, > i32, [4 x i8] } > > ; Function Attrs: nofree > define dllexport void @func(float* noalias nocapture %output, > %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull > readonly dereferenceable(16) %a, > %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull > readonly dereferenceable(16) %b) local_unnamed_addr #0 !ubaa. 
!1 { > entry: > %0 = getelementptr > %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0", > %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a, i64 0, i32 1 > %1 = load i32, i32* %0, align 1 > %.not = icmp eq i32 %1, 0 > br i1 %.not, label %BL.0042, label %BL.0005.lr.ph > > BL.0005.lr.ph: ; preds = %entry > %2 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a > to i8** > %3 = load i8*, i8** %2, align 1 > %4 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %b > to i8** > %5 = load i8*, i8** %4, align 1 > %wide.trip.count = zext i32 %1 to i64 > br label %BL.0005 > > BL.0005: ; preds = %BL.0005, % > BL.0005.lr.ph > %indvars.iv = phi i64 [ 0, %BL.0005.lr.ph ], [ %indvars.iv.next, > %BL.0005 ] > %6 = shl nuw nsw i64 %indvars.iv, 2 > %7 = getelementptr float, float* %output, i64 %indvars.iv > %8 = getelementptr i8, i8* %3, i64 %6 > %9 = bitcast i8* %8 to float* > %10 = load float, float* %9, align 4 > %11 = getelementptr i8, i8* %5, i64 %6 > %12 = bitcast i8* %11 to float* > %13 = load float, float* %12, align 4 > %14 = tail call float @llvm.pow.f32(float %10, float %13) > store float %14, float* %7, align 4 > %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 > %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count > br i1 %exitcond.not, label %BL.0042, label %BL.0005 > > BL.0042: ; preds = %BL.0005, > %entry > ret void > } > > ; Function Attrs: norecurse readnone > define dllexport void @burst.initialize(i8* (i8*)* nocapture readnone > %callback) local_unnamed_addr #1 !ubaa. 
!0 { > entry: > ret void > } > > ; Function Attrs: nounwind readnone speculatable willreturn > declare float @llvm.pow.f32(float, float) #2 > > attributes #0 = { nofree } > attributes #1 = { norecurse readnone } > attributes #2 = { nounwind readnone speculatable willreturn } > > !ubaa.Burst.Compiler.IL.Tests.VectorsMaths\2FFloatPointer.0 = !{!0, !0, > !0, !0} > > !0 = !{i1 false} > !1 = !{i1 true, i1 false, i1 false} > > If I run this with ../llvm-project/llvm/build/bin/opt.exe -o - -S -O3 > ../avx_sad_4.ll -mattr=avx -debug, I can see that the loop vectorizer > correctly considers using 8-wide ymm registers for this, but has decided > that the 4-wide variant is cheaper based on some cost modelling I don't > understand. > > So is this expected behaviour? I know there was some cost model changes in > the 10->11 timeframe. > > Thanks for any help, > > Cheers, > -Neil. >Roman> -- > Neil Henning > Senior Software Engineer Compiler > unity.com > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200716/bb590ea5/attachment.html>
Neil Henning via llvm-dev
2020-Jul-16 14:20 UTC
[llvm-dev] LLVM 11 and trunk selecting 4 wide instead of 8 wide loop vectorization for AVX-enabled target
Tried a bunch of them there (x86-64, haswell, znver2) and they all defaulted to 4-wide - haswell additionally caused some extra loop unrolling but still with 4-wide pows. Cheers, -Neil. On Thu, Jul 16, 2020 at 2:39 PM Roman Lebedev <lebedev.ri at gmail.com> wrote:> Did you specify the target CPU the code should be optimized for? > For clang that is -march=native/znver2/... / -mtune=<same> > For opt/llc that is --mcpu=<same> > I would expect that by default, some generic baseline is picked. > > On Thu, Jul 16, 2020 at 4:25 PM Neil Henning via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hey list, >> >> I've recently done the first test run of bumping our Burst compiler from >> LLVM 10 -> 11 now that the branch has been cut, and have noticed an >> apparent loop vectorization codegen regression for X86 with AVX or AVX2 >> enabled. The following IR example is vectorized to 4 wide with LLVM 11 and >> trunk whereas in LLVM 10 it (correctly as per what we want) vectorized it 8 >> wide matching the ymm registers. >> >> ; ModuleID = '../test.ll' >> source_filename = "main" >> target datalayout >> "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" >> target triple = "x86_64-pc-windows-msvc-coff" >> >> %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0" = type { float*, >> i32, [4 x i8] } >> >> ; Function Attrs: nofree >> define dllexport void @func(float* noalias nocapture %output, >> %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull >> readonly dereferenceable(16) %a, >> %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* nocapture nonnull >> readonly dereferenceable(16) %b) local_unnamed_addr #0 !ubaa. 
!1 { >> entry: >> %0 = getelementptr >> %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0", >> %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a, i64 0, i32 1 >> %1 = load i32, i32* %0, align 1 >> %.not = icmp eq i32 %1, 0 >> br i1 %.not, label %BL.0042, label %BL.0005.lr.ph >> >> BL.0005.lr.ph: ; preds = %entry >> %2 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %a >> to i8** >> %3 = load i8*, i8** %2, align 1 >> %4 = bitcast %"Burst.Compiler.IL.Tests.VectorsMaths/FloatPointer.0"* %b >> to i8** >> %5 = load i8*, i8** %4, align 1 >> %wide.trip.count = zext i32 %1 to i64 >> br label %BL.0005 >> >> BL.0005: ; preds = %BL.0005, % >> BL.0005.lr.ph >> %indvars.iv = phi i64 [ 0, %BL.0005.lr.ph ], [ %indvars.iv.next, >> %BL.0005 ] >> %6 = shl nuw nsw i64 %indvars.iv, 2 >> %7 = getelementptr float, float* %output, i64 %indvars.iv >> %8 = getelementptr i8, i8* %3, i64 %6 >> %9 = bitcast i8* %8 to float* >> %10 = load float, float* %9, align 4 >> %11 = getelementptr i8, i8* %5, i64 %6 >> %12 = bitcast i8* %11 to float* >> %13 = load float, float* %12, align 4 >> %14 = tail call float @llvm.pow.f32(float %10, float %13) >> store float %14, float* %7, align 4 >> %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 >> %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count >> br i1 %exitcond.not, label %BL.0042, label %BL.0005 >> >> BL.0042: ; preds = %BL.0005, >> %entry >> ret void >> } >> >> ; Function Attrs: norecurse readnone >> define dllexport void @burst.initialize(i8* (i8*)* nocapture readnone >> %callback) local_unnamed_addr #1 !ubaa. 
!0 { >> entry: >> ret void >> } >> >> ; Function Attrs: nounwind readnone speculatable willreturn >> declare float @llvm.pow.f32(float, float) #2 >> >> attributes #0 = { nofree } >> attributes #1 = { norecurse readnone } >> attributes #2 = { nounwind readnone speculatable willreturn } >> >> !ubaa.Burst.Compiler.IL.Tests.VectorsMaths\2FFloatPointer.0 = !{!0, !0, >> !0, !0} >> >> !0 = !{i1 false} >> !1 = !{i1 true, i1 false, i1 false} >> >> If I run this with ../llvm-project/llvm/build/bin/opt.exe -o - -S -O3 >> ../avx_sad_4.ll -mattr=avx -debug, I can see that the loop vectorizer >> correctly considers using 8-wide ymm registers for this, but has decided >> that the 4-wide variant is cheaper based on some cost modelling I don't >> understand. >> >> So is this expected behaviour? I know there was some cost model changes >> in the 10->11 timeframe. >> >> Thanks for any help, >> >> Cheers, >> -Neil. >> > Roman > > >> -- >> Neil Henning >> Senior Software Engineer Compiler >> unity.com >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >-- Neil Henning Senior Software Engineer Compiler unity.com -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200716/ed9e83d1/attachment.html>
Reasonably Related Threads
- LLVM 11 and trunk selecting 4 wide instead of 8 wide loop vectorization for AVX-enabled target
- LLVM 11 and trunk selecting 4 wide instead of 8 wide loop vectorization for AVX-enabled target
- Help with SROA throwing away no-alias information
- [RFC] `opt-out` attribute list for intrinsics
- loop unrolling introduces conditional branch