thr3ads.net - llvm dev - [llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture [Jun 2016]

If this information is useful, please help other people find it:
Share via:

Frank Winter via llvm-dev

2016-Jun-23 16:53 UTC

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

With LLVM 3.8 the JIT compiler engine generates an AVX512 instruction
although I target an 'avx2' CPU (Intel Core i7).
I just downloaded the most recent 3.8 release and it still happens.

It happens with this input module:


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @module_cFFEMJ(i64 %lo, i64 %hi, i64 %myId, i1 %ordered, i64 
%start, i32* noalias align 32 %arg0, i32* noalias align 32 %arg1) {
entrypoint:
   %0 = add nsw i64 %lo, %start
   %1 = add nsw i64 %hi, %start
   %2 = select i1 %ordered, i64 %0, i64 %lo
   %3 = select i1 %ordered, i64 %1, i64 %hi
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   %6 = bitcast i32* %arg1 to i64*
   %7 = load i64, i64* %6, align 32
   %8 = trunc i64 %7 to i32
   %9 = getelementptr i32, i32* %arg1, i64 1
   %10 = lshr i64 %7, 32
   %11 = trunc i64 %10 to i32
   %12 = getelementptr i32, i32* %arg1, i64 2
   %13 = bitcast i32* %12 to i64*
   %14 = load i64, i64* %13, align 8
   %15 = trunc i64 %14 to i32
   %16 = getelementptr i32, i32* %arg1, i64 3
   %17 = lshr i64 %14, 32
   %18 = trunc i64 %17 to i32
   br label %L5

L5:                                               ; preds = %L5, %entrypoint
   %19 = phi i64 [ %32, %L5 ], [ %4, %entrypoint ]
   %20 = shl i64 %19, 4
   %21 = or i64 %20, 4
   %22 = or i64 %20, 8
   %23 = or i64 %20, 12
   %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %8, i32 0
   %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, 
<4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %11, i32
0
   %broadcast.splat12 = shufflevector <4 x i32> 
%broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x i32> undef, i32 %15, i32
0
   %broadcast.splat14 = shufflevector <4 x i32> 
%broadcast.splatinsert13, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %18, i32
0
   %broadcast.splat16 = shufflevector <4 x i32> 
%broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
   %24 = getelementptr i32, i32* %arg0, i64 %20
   %25 = bitcast i32* %24 to <4 x i32>*
   store <4 x i32> %broadcast.splat10, <4 x i32>* %25, align 16
   %26 = getelementptr i32, i32* %arg0, i64 %21
   %27 = bitcast i32* %26 to <4 x i32>*
   store <4 x i32> %broadcast.splat12, <4 x i32>* %27, align 16
   %28 = getelementptr i32, i32* %arg0, i64 %22
   %29 = bitcast i32* %28 to <4 x i32>*
   store <4 x i32> %broadcast.splat14, <4 x i32>* %29, align 16
   %30 = getelementptr i32, i32* %arg0, i64 %23
   %31 = bitcast i32* %30 to <4 x i32>*
   store <4 x i32> %broadcast.splat16, <4 x i32>* %31, align 16
   %32 = add nsw i64 %19, 1
   %33 = icmp slt i64 %32, %5
   br i1 %33, label %L5, label %L6

L6:                                               ; preds = %L5
   ret void
}


The following code lines show how I call the JIT compiler ('Mod' points
to the module).

llvm::EngineBuilder 
engineBuilder(std::move(std::unique_ptr<llvm::Module>(Mod)));
engineBuilder.setMCPU(llvm::sys::getHostCPUName());
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
engineBuilder.setOptLevel(llvm::CodeGenOpt::Aggressive);
engineBuilder.setErrorStr(&mcjit_error);

llvm::TargetOptions targetOptions;
targetOptions.AllowFPOpFusion = llvm::FPOpFusion::Fast;
engineBuilder.setTargetOptions( targetOptions );

TheExecutionEngine = engineBuilder.create();

targetMachine = engineBuilder.selectTarget();
Mod->setDataLayout( targetMachine->createDataLayout() );

TheExecutionEngine->finalizeObject();  // MCJIT
fptr_mainFunc_extern = TheExecutionEngine->getPointerToFunction( 
mainFunc_extern );


When calling the function, an 'illegal instruction' error is raised.
Looking at the generated assembly reveals an AVX512 instruction which
shouldn't be there.

Assembly:
     .text
     .file    "module"
     .globl    main
     .align    16, 0x90
     .type    main,@function
main:
     .cfi_startproc
     movq    8(%rsp), %r10
     leaq    (%rdi,%r8), %rdx
     addq    %rsi, %r8
     testb    $1, %cl
     cmoveq    %rdi, %rdx
     cmoveq    %rsi, %r8
     movq    %rdx, %rax
     sarq    $63, %rax
     shrq    $62, %rax
     addq    %rdx, %rax
     sarq    $2, %rax
     movq    %r8, %rcx
     sarq    $63, %rcx
     shrq    $62, %rcx
     addq    %r8, %rcx
     sarq    $2, %rcx
     movq    (%r10), %r8
     movq    8(%r10), %r10
     movq    %r8, %rdi
     shrq    $32, %rdi
     movq    %r10, %rsi
     shrq    $32, %rsi
     movq    %rax, %rdx
     shlq    $6, %rdx
     leaq    48(%rdx,%r9), %rdx
     .align    16, 0x90
.LBB0_1:
     vmovd    %r8d, %xmm0
     vpbroadcastd    %xmm0, %xmm0
     vmovd    %edi, %xmm1
     vpbroadcastd    %xmm1, %xmm1
     vmovd    %r10d, %xmm2
     vpbroadcastd    %xmm2, %xmm2
     vmovd    %esi, %xmm3
     vpbroadcastd    %xmm3, %xmm3
     vmovdqa32    %xmm0, -48(%rdx)
     vmovdqa32    %xmm1, -32(%rdx)
     vmovdqa32    %xmm2, -16(%rdx)
     vmovdqa32    %xmm3, (%rdx)
     addq    $1, %rax
     addq    $64, %rdx
     cmpq    %rcx, %rax
     jl    .LBB0_1
     retq
.Lfunc_end0:
     .size    main, .Lfunc_end0-main
     .cfi_endproc


     .section    ".note.GNU-stack","",@progbits

end assembly!

I am not sure which instruction is the offending one, but 'vmovdqa32'
looks like an AVX512 instruction.

I wasn't able to reproduce this with 'opt' - it generates avx2 
instructions. And when I force it to use e.g. avx512f it rejects the CPU 
type.

Any ideas?


Frank

Craig Topper via llvm-dev

2016-Jun-23 16:56 UTC

head link

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

Can you check what value "getHostCPUName" returned?

On Thu, Jun 23, 2016 at 9:53 AM, Frank Winter via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> With LLVM 3.8 the JIT compiler engine generates an AVX512 instruction
> although I target an 'avx2' CPU (intel Core I7).
> I just downloaded the most recent 3.8 and still it happens.
>
> It happens with this input module:
>
>
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>
> define void @module_cFFEMJ(i64 %lo, i64 %hi, i64 %myId, i1 %ordered, i64
> %start, i32* noalias align 32 %arg0, i32* noalias align 32 %arg1) {
> entrypoint:
>   %0 = add nsw i64 %lo, %start
>   %1 = add nsw i64 %hi, %start
>   %2 = select i1 %ordered, i64 %0, i64 %lo
>   %3 = select i1 %ordered, i64 %1, i64 %hi
>   %4 = sdiv i64 %2, 4
>   %5 = sdiv i64 %3, 4
>   %6 = bitcast i32* %arg1 to i64*
>   %7 = load i64, i64* %6, align 32
>   %8 = trunc i64 %7 to i32
>   %9 = getelementptr i32, i32* %arg1, i64 1
>   %10 = lshr i64 %7, 32
>   %11 = trunc i64 %10 to i32
>   %12 = getelementptr i32, i32* %arg1, i64 2
>   %13 = bitcast i32* %12 to i64*
>   %14 = load i64, i64* %13, align 8
>   %15 = trunc i64 %14 to i32
>   %16 = getelementptr i32, i32* %arg1, i64 3
>   %17 = lshr i64 %14, 32
>   %18 = trunc i64 %17 to i32
>   br label %L5
>
> L5:                                               ; preds = %L5,
> %entrypoint
>   %19 = phi i64 [ %32, %L5 ], [ %4, %entrypoint ]
>   %20 = shl i64 %19, 4
>   %21 = or i64 %20, 4
>   %22 = or i64 %20, 8
>   %23 = or i64 %20, 12
>   %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %8,
i32 0
>   %broadcast.splat10 = shufflevector <4 x i32>
%broadcast.splatinsert9, <4
> x i32> undef, <4 x i32> zeroinitializer
>   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %11,
i32 0
>   %broadcast.splat12 = shufflevector <4 x i32>
%broadcast.splatinsert11,
> <4 x i32> undef, <4 x i32> zeroinitializer
>   %broadcast.splatinsert13 = insertelement <4 x i32> undef, i32 %15,
i32 0
>   %broadcast.splat14 = shufflevector <4 x i32>
%broadcast.splatinsert13,
> <4 x i32> undef, <4 x i32> zeroinitializer
>   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %18,
i32 0
>   %broadcast.splat16 = shufflevector <4 x i32>
%broadcast.splatinsert15,
> <4 x i32> undef, <4 x i32> zeroinitializer
>   %24 = getelementptr i32, i32* %arg0, i64 %20
>   %25 = bitcast i32* %24 to <4 x i32>*
>   store <4 x i32> %broadcast.splat10, <4 x i32>* %25, align 16
>   %26 = getelementptr i32, i32* %arg0, i64 %21
>   %27 = bitcast i32* %26 to <4 x i32>*
>   store <4 x i32> %broadcast.splat12, <4 x i32>* %27, align 16
>   %28 = getelementptr i32, i32* %arg0, i64 %22
>   %29 = bitcast i32* %28 to <4 x i32>*
>   store <4 x i32> %broadcast.splat14, <4 x i32>* %29, align 16
>   %30 = getelementptr i32, i32* %arg0, i64 %23
>   %31 = bitcast i32* %30 to <4 x i32>*
>   store <4 x i32> %broadcast.splat16, <4 x i32>* %31, align 16
>   %32 = add nsw i64 %19, 1
>   %33 = icmp slt i64 %32, %5
>   br i1 %33, label %L5, label %L6
>
> L6:                                               ; preds = %L5
>   ret void
> }
>
>
> The following code line show how I call the JIT compiler. ('Mod' is
> pointing to the module).
>
> llvm::EngineBuilder
> engineBuilder(std::move(std::unique_ptr<llvm::Module>(Mod)));
> engineBuilder.setMCPU(llvm::sys::getHostCPUName());
> engineBuilder.setEngineKind(llvm::EngineKind::JIT);
> engineBuilder.setOptLevel(llvm::CodeGenOpt::Aggressive);
> engineBuilder.setErrorStr(&mcjit_error);
>
> llvm::TargetOptions targetOptions;
> targetOptions.AllowFPOpFusion = llvm::FPOpFusion::Fast;
> engineBuilder.setTargetOptions( targetOptions );
>
> TheExecutionEngine = engineBuilder.create();
>
> targetMachine = engineBuilder.selectTarget();
> Mod->setDataLayout( targetMachine->createDataLayout() );
>
> TheExecutionEngine->finalizeObject();  // MCJIT
> fptr_mainFunc_extern = TheExecutionEngine->getPointerToFunction(
> mainFunc_extern );
>
>
> When calling the function an 'illegal instruction' is raised.
> Looking at the assembler reveals an AVX512 instruction which shouldn't be
> there.
>
> Assembly:
>     .text
>     .file    "module"
>     .globl    main
>     .align    16, 0x90
>     .type    main, at function
> main:
>     .cfi_startproc
>     movq    8(%rsp), %r10
>     leaq    (%rdi,%r8), %rdx
>     addq    %rsi, %r8
>     testb    $1, %cl
>     cmoveq    %rdi, %rdx
>     cmoveq    %rsi, %r8
>     movq    %rdx, %rax
>     sarq    $63, %rax
>     shrq    $62, %rax
>     addq    %rdx, %rax
>     sarq    $2, %rax
>     movq    %r8, %rcx
>     sarq    $63, %rcx
>     shrq    $62, %rcx
>     addq    %r8, %rcx
>     sarq    $2, %rcx
>     movq    (%r10), %r8
>     movq    8(%r10), %r10
>     movq    %r8, %rdi
>     shrq    $32, %rdi
>     movq    %r10, %rsi
>     shrq    $32, %rsi
>     movq    %rax, %rdx
>     shlq    $6, %rdx
>     leaq    48(%rdx,%r9), %rdx
>     .align    16, 0x90
> .LBB0_1:
>     vmovd    %r8d, %xmm0
>     vpbroadcastd    %xmm0, %xmm0
>     vmovd    %edi, %xmm1
>     vpbroadcastd    %xmm1, %xmm1
>     vmovd    %r10d, %xmm2
>     vpbroadcastd    %xmm2, %xmm2
>     vmovd    %esi, %xmm3
>     vpbroadcastd    %xmm3, %xmm3
>     vmovdqa32    %xmm0, -48(%rdx)
>     vmovdqa32    %xmm1, -32(%rdx)
>     vmovdqa32    %xmm2, -16(%rdx)
>     vmovdqa32    %xmm3, (%rdx)
>     addq    $1, %rax
>     addq    $64, %rdx
>     cmpq    %rcx, %rax
>     jl    .LBB0_1
>     retq
> .Lfunc_end0:
>     .size    main, .Lfunc_end0-main
>     .cfi_endproc
>
>
>     .section    ".note.GNU-stack","", at progbits
>
> end assembly!
>
> I am not sure what instruction is the offending one, but the 'vmovdqa32'
> looks avx512.
>
> I wasn't able to reproduce this with 'opt' - it generates avx2
> instructions. And when I force it to use e.g. avx512f it rejects the CPU
> type.
>
> Any ideas?
>
>
> Frank
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>


-- 
~Craig
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160623/bfd3af4d/attachment.html>

Frank Winter via llvm-dev

2016-Jun-23 17:00 UTC

head link

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

On 06/23/2016 12:56 PM, Craig Topper wrote:
> Can you check what value "getHostCPUName" returned?

getHostCPUName() = skylake

>
> On Thu, Jun 23, 2016 at 9:53 AM, Frank Winter via llvm-dev 
> <llvm-dev at lists.llvm.org <mailto:llvm-dev at
lists.llvm.org>> wrote:
>
>     With LLVM 3.8 the JIT compiler engine generates an AVX512
>     instruction although I target an 'avx2' CPU (intel Core I7).
>     I just downloaded the most recent 3.8 and still it happens.
>
>     It happens with this input module:
>
>
>     target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>
>     define void @module_cFFEMJ(i64 %lo, i64 %hi, i64 %myId, i1
>     %ordered, i64 %start, i32* noalias align 32 %arg0, i32* noalias
>     align 32 %arg1) {
>     entrypoint:
>       %0 = add nsw i64 %lo, %start
>       %1 = add nsw i64 %hi, %start
>       %2 = select i1 %ordered, i64 %0, i64 %lo
>       %3 = select i1 %ordered, i64 %1, i64 %hi
>       %4 = sdiv i64 %2, 4
>       %5 = sdiv i64 %3, 4
>       %6 = bitcast i32* %arg1 to i64*
>       %7 = load i64, i64* %6, align 32
>       %8 = trunc i64 %7 to i32
>       %9 = getelementptr i32, i32* %arg1, i64 1
>       %10 = lshr i64 %7, 32
>       %11 = trunc i64 %10 to i32
>       %12 = getelementptr i32, i32* %arg1, i64 2
>       %13 = bitcast i32* %12 to i64*
>       %14 = load i64, i64* %13, align 8
>       %15 = trunc i64 %14 to i32
>       %16 = getelementptr i32, i32* %arg1, i64 3
>       %17 = lshr i64 %14, 32
>       %18 = trunc i64 %17 to i32
>       br label %L5
>
>     L5:                                               ; preds = %L5,
>     %entrypoint
>       %19 = phi i64 [ %32, %L5 ], [ %4, %entrypoint ]
>       %20 = shl i64 %19, 4
>       %21 = or i64 %20, 4
>       %22 = or i64 %20, 8
>       %23 = or i64 %20, 12
>       %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32
%8,
>     i32 0
>       %broadcast.splat10 = shufflevector <4 x i32>
>     %broadcast.splatinsert9, <4 x i32> undef, <4 x i32>
zeroinitializer
>       %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32
>     %11, i32 0
>       %broadcast.splat12 = shufflevector <4 x i32>
>     %broadcast.splatinsert11, <4 x i32> undef, <4 x i32>
zeroinitializer
>       %broadcast.splatinsert13 = insertelement <4 x i32> undef, i32
>     %15, i32 0
>       %broadcast.splat14 = shufflevector <4 x i32>
>     %broadcast.splatinsert13, <4 x i32> undef, <4 x i32>
zeroinitializer
>       %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32
>     %18, i32 0
>       %broadcast.splat16 = shufflevector <4 x i32>
>     %broadcast.splatinsert15, <4 x i32> undef, <4 x i32>
zeroinitializer
>       %24 = getelementptr i32, i32* %arg0, i64 %20
>       %25 = bitcast i32* %24 to <4 x i32>*
>       store <4 x i32> %broadcast.splat10, <4 x i32>* %25, align
16
>       %26 = getelementptr i32, i32* %arg0, i64 %21
>       %27 = bitcast i32* %26 to <4 x i32>*
>       store <4 x i32> %broadcast.splat12, <4 x i32>* %27, align
16
>       %28 = getelementptr i32, i32* %arg0, i64 %22
>       %29 = bitcast i32* %28 to <4 x i32>*
>       store <4 x i32> %broadcast.splat14, <4 x i32>* %29, align
16
>       %30 = getelementptr i32, i32* %arg0, i64 %23
>       %31 = bitcast i32* %30 to <4 x i32>*
>       store <4 x i32> %broadcast.splat16, <4 x i32>* %31, align
16
>       %32 = add nsw i64 %19, 1
>       %33 = icmp slt i64 %32, %5
>       br i1 %33, label %L5, label %L6
>
>     L6:                                               ; preds = %L5
>       ret void
>     }
>
>
>     The following code line show how I call the JIT compiler. ('Mod'
>     is pointing to the module).
>
>     llvm::EngineBuilder
>     engineBuilder(std::move(std::unique_ptr<llvm::Module>(Mod)));
>     engineBuilder.setMCPU(llvm::sys::getHostCPUName());
>     engineBuilder.setEngineKind(llvm::EngineKind::JIT);
>     engineBuilder.setOptLevel(llvm::CodeGenOpt::Aggressive);
>     engineBuilder.setErrorStr(&mcjit_error);
>
>     llvm::TargetOptions targetOptions;
>     targetOptions.AllowFPOpFusion = llvm::FPOpFusion::Fast;
>     engineBuilder.setTargetOptions( targetOptions );
>
>     TheExecutionEngine = engineBuilder.create();
>
>     targetMachine = engineBuilder.selectTarget();
>     Mod->setDataLayout( targetMachine->createDataLayout() );
>
>     TheExecutionEngine->finalizeObject();  // MCJIT
>     fptr_mainFunc_extern = TheExecutionEngine->getPointerToFunction(
>     mainFunc_extern );
>
>
>     When calling the function an 'illegal instruction' is raised.
>     Looking at the assembler reveals an AVX512 instruction which
>     shouldn't be there.
>
>     Assembly:
>         .text
>         .file    "module"
>         .globl    main
>         .align    16, 0x90
>         .type    main, at function
>     main:
>         .cfi_startproc
>         movq    8(%rsp), %r10
>         leaq    (%rdi,%r8), %rdx
>         addq    %rsi, %r8
>         testb    $1, %cl
>         cmoveq    %rdi, %rdx
>         cmoveq    %rsi, %r8
>         movq    %rdx, %rax
>         sarq    $63, %rax
>         shrq    $62, %rax
>         addq    %rdx, %rax
>         sarq    $2, %rax
>         movq    %r8, %rcx
>         sarq    $63, %rcx
>         shrq    $62, %rcx
>         addq    %r8, %rcx
>         sarq    $2, %rcx
>         movq    (%r10), %r8
>         movq    8(%r10), %r10
>         movq    %r8, %rdi
>         shrq    $32, %rdi
>         movq    %r10, %rsi
>         shrq    $32, %rsi
>         movq    %rax, %rdx
>         shlq    $6, %rdx
>         leaq    48(%rdx,%r9), %rdx
>         .align    16, 0x90
>     .LBB0_1:
>         vmovd    %r8d, %xmm0
>         vpbroadcastd    %xmm0, %xmm0
>         vmovd    %edi, %xmm1
>         vpbroadcastd    %xmm1, %xmm1
>         vmovd    %r10d, %xmm2
>         vpbroadcastd    %xmm2, %xmm2
>         vmovd    %esi, %xmm3
>         vpbroadcastd    %xmm3, %xmm3
>         vmovdqa32    %xmm0, -48(%rdx)
>         vmovdqa32    %xmm1, -32(%rdx)
>         vmovdqa32    %xmm2, -16(%rdx)
>         vmovdqa32    %xmm3, (%rdx)
>         addq    $1, %rax
>         addq    $64, %rdx
>         cmpq    %rcx, %rax
>         jl    .LBB0_1
>         retq
>     .Lfunc_end0:
>         .size    main, .Lfunc_end0-main
>         .cfi_endproc
>
>
>         .section    ".note.GNU-stack","", at progbits
>
>     end assembly!
>
>     I am not sure what instruction is the offending one, but the
>     'vmovdqa32' looks avx512.
>
>     I wasn't able to reproduce this with 'opt' - it generates avx2
>     instructions. And when I force it to use e.g. avx512f it rejects
>     the CPU type.
>
>     Any ideas?
>
>
>     Frank
>     _______________________________________________
>     LLVM Developers mailing list
>     llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>
>     http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>
>
>
> -- 
> ~Craig
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160623/98510675/attachment.html>

llvm dev - Jun 2016 - AVX512 instruction generated when JIT compiling for an avx2 architecture

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture