thr3ads.net - llvm dev - [llvm-dev] avx512 JIT backend generates wrong code on <4 x float> [Jun 2016]

If this information is useful, please help other people find it:
Share via:

Frank Winter via llvm-dev

2016-Jun-29 19:41 UTC

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

Hi!

When compiling the attached module with the JIT engine on an Intel KNL, I 
see wrong code being emitted. I attach a complete exploit program 
that demonstrates the bug in LLVM 3.8. It loads and JIT-compiles the module 
and prints the assembly. I stumbled on this because the result of an 
actual calculation was wrong, so it is not only the textual assembly that 
is wrong; the generated machine code is wrong as well.

When I execute the exploit program on an Intel KNL the following output 
is produced:

CPU name = knl
-sse4a,-avx512bw,cx16,-tbm,xsave,-fma4,-avx512vl,prfchw,bmi2,adx,-xsavec,fsgsbase,avx,avx512cd,avx512pf,-rtm,popcnt,fma,bmi,aes,rdrnd,-xsaves,sse4.1,sse4.2,avx2,avx512er,sse,lzcnt,pclmul,avx512f,f16c,ssse3,mmx,-pku,cmov,-xop,rdseed,movbe,-hle,xsaveopt,-sha,sse2,sse3,-avx512dq,
Assembly:
     # adjmul -- JIT output for the attached module (CPU "knl"), AT&T syntax.
     #
     # Register roles, consistent with System V AMD64 argument passing for
     #   adjmul(i64 lo, i64 hi, i64 myId, i1 ordered, i64 start,
     #          float* arg0, float* arg1, float* arg2):
     #   rdi = lo, rsi = hi, rdx = myId (overwritten without being read),
     #   cl = ordered, r8 = start, r9 = arg0, 8(%rsp) = arg1, 16(%rsp) = arg2.
     #
     # NOTE: vfmadd213ss / vfmsub213ss in the loop are the miscompile being
     # reported; they are reproduced exactly as emitted, not corrected.
     .text
     .file    "module_KFxOBX_i4_after.ll"
     .globl    adjmul
     .align    16, 0x90
     .type    adjmul,@function
adjmul:
     .cfi_startproc
     leaq    (%rdi,%r8), %rdx          # rdx = lo + start
     addq    %rsi, %r8                 # r8  = hi + start
     testb    $1, %cl                  # ZF set when ordered == 0
     cmoveq    %rdi, %rdx              # ordered == 0: rdx = lo
     cmoveq    %rsi, %r8               # ordered == 0: r8  = hi
     # rax = rdx / 4, signed, rounding toward zero: add a bias of 3 when
     # rdx is negative, then shift arithmetically.
     movq    %rdx, %rax
     sarq    $63, %rax
     shrq    $62, %rax
     addq    %rdx, %rax
     sarq    $2, %rax                  # rax = first loop index
     # rcx = r8 / 4, same sequence.
     movq    %r8, %rcx
     sarq    $63, %rcx
     shrq    $62, %rcx
     addq    %r8, %rcx
     sarq    $2, %rcx                  # rcx = loop bound
     movq    %rax, %rdx
     shlq    $5, %rdx                  # rdx = index * 32 bytes (8 floats)
     leaq    16(%r9,%rdx), %rsi        # rsi = arg0 + offset + 16
     orq    $16, %rdx                  # rdx = offset | 16
     movq    16(%rsp), %rdi            # rdi = arg2
     addq    %rdx, %rdi                # rdi = arg2 + (offset | 16)
     addq    8(%rsp), %rdx             # rdx = arg1 + (offset | 16)
     .align    16, 0x90
.LBB0_1:
     vmovaps    -16(%rdx), %xmm0       # xmm0 = first  4 floats from arg1
     vmovaps    (%rdx), %xmm1          # xmm1 = second 4 floats from arg1
     vmovaps    -16(%rdi), %xmm2       # xmm2 = first  4 floats from arg2
     vmovaps    (%rdi), %xmm3          # xmm3 = second 4 floats from arg2
     vmulps    %xmm3, %xmm1, %xmm4     # xmm4 = xmm1 * xmm3 (all 4 lanes)
     vmulps    %xmm2, %xmm1, %xmm1     # xmm1 = xmm1 * xmm2 (all 4 lanes)
     # Scalar forms: only lane 0 is computed, lanes 1-3 of the destination
     # keep the values loaded above. The IR asks for <4 x float> results.
     vfmadd213ss    %xmm4, %xmm0, %xmm2    # xmm2[0] = xmm0[0]*xmm2[0] + xmm4[0]
     vfmsub213ss    %xmm1, %xmm0, %xmm3    # xmm3[0] = xmm0[0]*xmm3[0] - xmm1[0]
     vmovaps    %xmm2, -16(%rsi)       # store first  result group to arg0
     vmovaps    %xmm3, (%rsi)          # store second result group to arg0
     addq    $1, %rax                  # next index
     addq    $32, %rsi                 # advance all three pointers by 8 floats
     addq    $32, %rdi
     addq    $32, %rdx
     cmpq    %rcx, %rax
     jl    .LBB0_1                     # signed compare: loop while index < bound
     retq
.Lfunc_end0:
     .size    adjmul, .Lfunc_end0-adjmul
     .cfi_endproc


     .section    ".note.GNU-stack","",@progbits

end assembly!


The instructions 'vfmadd213ss' and 'vfmsub213ss' are scalar: 'Fused 
Multiply-Add of Scalar Single-Precision Floating-Point' operates on the 
lowest element only. The <4 x float> operations in the module need the 
packed SIMD forms (e.g. 'vfmadd213ps'). Note that the KNL has 16-wide 
float SIMD, while the exploit module uses only 4 lanes. However, the 
backend should be able to handle this.

Unless I receive further ideas I will file an official bug report.

Frank

-------------- next part --------------
# Builds the JIT reproducer `main` from main.cc against an installed LLVM tree.
LLVMPATH=/home/fwinter/toolchain/install/llvm-3.8-recent

# llvm-config supplies the compile flags, component libraries and linker flags.
LLVMCONF=$(LLVMPATH)/bin/llvm-config
CXXFLAGS=$(shell $(LLVMCONF) --cxxflags)
# Keep on one line: a line break inside $(shell ...) leaves the call unterminated.
LIBS=$(shell $(LLVMCONF) --libs core mcjit native scalaropts vectorize irreader linker) -ldl
LDFLAGS=$(shell $(LLVMCONF) --ldflags)


CXX=g++

OBJS=main.o

TARGET=main

# Neither target names a file, so never treat them as up to date.
.PHONY: all clean

all: $(TARGET)

# Link step: objects first, then the LLVM libraries and linker flags.
main: $(OBJS)
	$(CXX) -g -o $@ $^ $(LIBS)  $(LDFLAGS)

# Compile each .cc to a .o with debug info.
%.o: %.cc
	$(CXX) $(CXXFLAGS) -c -g $<


clean:
	rm -rf $(TARGET) $(OBJS) *~


-------------- next part --------------
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Reproducer kernel. Each iteration i reads two consecutive <4 x float>
; groups from %arg1 (call them a, b) and from %arg2 (c, d), at float index
; 8*i and 8*i+4, and writes a*c + b*d and a*d - b*c to the same positions
; in %arg0.
; NOTE(review): the arithmetic looks like a conjugated complex multiply on
; split real/imaginary groups -- inferred from the expressions, confirm
; with the author.
; %myId is not referenced in the body. The loop is bottom-tested, so the body
; executes at least once regardless of how the two bounds compare.
define void @adjmul(i64 %lo, i64 %hi, i64 %myId, i1 %ordered, i64 %start, float*
noalias align 64 %arg0, float* noalias align 64 %arg1, float* noalias align 64
%arg2) {
entrypoint:
  ; When %ordered is true the range is shifted by %start, otherwise the raw
  ; %lo / %hi are used.
  %0 = add nsw i64 %lo, %start
  %1 = add nsw i64 %hi, %start
  %2 = select i1 %ordered, i64 %0, i64 %lo
  %3 = select i1 %ordered, i64 %1, i64 %hi
  ; Signed division by 4 gives the first loop index (%4) and the bound (%5).
  %4 = sdiv i64 %2, 4
  %5 = sdiv i64 %3, 4
  br label %L5

L5:                                               ; preds = %L5, %entrypoint
  ; %6 is the loop index; %7 = 8*index and %8 = 8*index + 4 are float offsets
  ; (the low three bits of %7 are zero, so the `or` acts as an add).
  %6 = phi i64 [ %27, %L5 ], [ %4, %entrypoint ]
  %7 = shl i64 %6, 3
  %8 = or i64 %7, 4
  ; a = arg1[%7 .. %7+3]
  %9 = getelementptr float, float* %arg1, i64 %7
  %10 = bitcast float* %9 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %10, align 16
  ; b = arg1[%8 .. %8+3]
  %11 = getelementptr float, float* %arg1, i64 %8
  %12 = bitcast float* %11 to <4 x float>*
  %wide.load5 = load <4 x float>, <4 x float>* %12, align 16
  ; c = arg2[%7 .. %7+3]
  %13 = getelementptr float, float* %arg2, i64 %7
  %14 = bitcast float* %13 to <4 x float>*
  %wide.load6 = load <4 x float>, <4 x float>* %14, align 16
  ; d = arg2[%8 .. %8+3]
  %15 = getelementptr float, float* %arg2, i64 %8
  %16 = bitcast float* %15 to <4 x float>*
  %wide.load7 = load <4 x float>, <4 x float>* %16, align 16
  ; %19 = a*c + b*d, element-wise over all four lanes.
  %17 = fmul <4 x float> %wide.load, %wide.load6
  %18 = fmul <4 x float> %wide.load5, %wide.load7
  %19 = fadd <4 x float> %17, %18
  ; %22 = a*d - b*c, element-wise over all four lanes.
  %20 = fmul <4 x float> %wide.load, %wide.load7
  %21 = fmul <4 x float> %wide.load5, %wide.load6
  %22 = fsub <4 x float> %20, %21
  ; Store both result groups to arg0 at the same two offsets.
  %23 = getelementptr float, float* %arg0, i64 %7
  %24 = bitcast float* %23 to <4 x float>*
  store <4 x float> %19, <4 x float>* %24, align 16
  %25 = getelementptr float, float* %arg0, i64 %8
  %26 = bitcast float* %25 to <4 x float>*
  store <4 x float> %22, <4 x float>* %26, align 16
  ; Advance the index and repeat while it is (signed) below the bound %5.
  %27 = add nsw i64 %6, 1
  %28 = icmp slt i64 %27, %5
  br i1 %28, label %L5, label %L6

L6:                                               ; preds = %L5
  ret void
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: main.cc
Type: text/x-c++src
Size: 5059 bytes
Desc: not available
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160629/8008aa24/attachment.cc>

Hal Finkel via llvm-dev

2016-Jun-29 19:48 UTC

head link

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

Hi Frank,

I recommend trying trunk LLVM. AVX-512 development has been very active
recently.

 -Hal

----- Original Message -----
> From: "Frank Winter via llvm-dev" <llvm-dev at lists.llvm.org>
> To: "LLVM Dev" <llvm-dev at lists.llvm.org>
> Sent: Wednesday, June 29, 2016 2:41:39 PM
> Subject: [llvm-dev] avx512 JIT backend generates wrong code on <4 x float>
> 
> Hi!
> 
> When compiling the attached module with the JIT engine on an Intel
> KNL I
> see wrong code getting emitted. I attach a complete exploit program
> which shows the bug in LLVM 3.8. It loads and JIT compiles the module
> and prints the assembler. I stumbled on this since the result of an
> actual calculation was wrong. So, it's not only the text version of
> the
> assembler also the machine assembler is wrong.
> 
> When I execute the exploit program on an Intel KNL the following
> output
> is produced:
> 
> CPU name = knl
>
-sse4a,-avx512bw,cx16,-tbm,xsave,-fma4,-avx512vl,prfchw,bmi2,adx,-xsavec,fsgsbase,avx,avx512cd,avx512pf,-rtm,popcnt,fma,bmi,aes,rdrnd,-xsaves,sse4.1,sse4.2,avx2,avx512er,sse,lzcnt,pclmul,avx512f,f16c,ssse3,mmx,-pku,cmov,-xop,rdseed,movbe,-hle,xsaveopt,-sha,sse2,sse3,-avx512dq,
> Assembly:
>      .text
>      .file    "module_KFxOBX_i4_after.ll"
>      .globl    adjmul
>      .align    16, 0x90
>      .type    adjmul,@function
> adjmul:
>      .cfi_startproc
>      leaq    (%rdi,%r8), %rdx
>      addq    %rsi, %r8
>      testb    $1, %cl
>      cmoveq    %rdi, %rdx
>      cmoveq    %rsi, %r8
>      movq    %rdx, %rax
>      sarq    $63, %rax
>      shrq    $62, %rax
>      addq    %rdx, %rax
>      sarq    $2, %rax
>      movq    %r8, %rcx
>      sarq    $63, %rcx
>      shrq    $62, %rcx
>      addq    %r8, %rcx
>      sarq    $2, %rcx
>      movq    %rax, %rdx
>      shlq    $5, %rdx
>      leaq    16(%r9,%rdx), %rsi
>      orq    $16, %rdx
>      movq    16(%rsp), %rdi
>      addq    %rdx, %rdi
>      addq    8(%rsp), %rdx
>      .align    16, 0x90
> .LBB0_1:
>      vmovaps    -16(%rdx), %xmm0
>      vmovaps    (%rdx), %xmm1
>      vmovaps    -16(%rdi), %xmm2
>      vmovaps    (%rdi), %xmm3
>      vmulps    %xmm3, %xmm1, %xmm4
>      vmulps    %xmm2, %xmm1, %xmm1
>      vfmadd213ss    %xmm4, %xmm0, %xmm2
>      vfmsub213ss    %xmm1, %xmm0, %xmm3
>      vmovaps    %xmm2, -16(%rsi)
>      vmovaps    %xmm3, (%rsi)
>      addq    $1, %rax
>      addq    $32, %rsi
>      addq    $32, %rdi
>      addq    $32, %rdx
>      cmpq    %rcx, %rax
>      jl    .LBB0_1
>      retq
> .Lfunc_end0:
>      .size    adjmul, .Lfunc_end0-adjmul
>      .cfi_endproc
> 
> 
>      .section    ".note.GNU-stack","",@progbits
> 
> end assembly!
> 
> 
> The instructions 'vfmadd213ss' are 'Fused Multiply-Add of Scalar
> Single-Precision Floating-Point'. Those should be SIMD vector
> instructions. Note that the KNL has 16 wide float SIMD, while the
> exploit module uses only 4. However, the backend should be able to
> handle this.
> 
> Unless I receive further ideas I will file an official bug report.
> 
> Frank
> 
> 
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
> 
-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory

Frank Winter via llvm-dev

2016-Jun-30 16:49 UTC

head link

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

Hi Hal!

Thanks, but unfortunately it didn't help. The exact same assembler 
instructions are generated for both 3.8 (yesterday) and trunk (from today).

So, this really looks like a bug.

Best,
Frank

On 06/29/2016 03:48 PM, Hal Finkel wrote:
> Hi Frank,
>
> I recommend trying trunk LLVM. AVX-512 development has been very active
> recently.
>
>   -Hal
>
> ----- Original Message -----
>> From: "Frank Winter via llvm-dev" <llvm-dev at lists.llvm.org>
>> To: "LLVM Dev" <llvm-dev at lists.llvm.org>
>> Sent: Wednesday, June 29, 2016 2:41:39 PM
>> Subject: [llvm-dev] avx512 JIT backend generates wrong code on <4 x float>
>>
>> Hi!
>>
>> When compiling the attached module with the JIT engine on an Intel
>> KNL I
>> see wrong code getting emitted. I attach a complete exploit program
>> which shows the bug in LLVM 3.8. It loads and JIT compiles the module
>> and prints the assembler. I stumbled on this since the result of an
>> actual calculation was wrong. So, it's not only the text version of
>> the
>> assembler also the machine assembler is wrong.
>>
>> When I execute the exploit program on an Intel KNL the following
>> output
>> is produced:
>>
>> CPU name = knl
>>
-sse4a,-avx512bw,cx16,-tbm,xsave,-fma4,-avx512vl,prfchw,bmi2,adx,-xsavec,fsgsbase,avx,avx512cd,avx512pf,-rtm,popcnt,fma,bmi,aes,rdrnd,-xsaves,sse4.1,sse4.2,avx2,avx512er,sse,lzcnt,pclmul,avx512f,f16c,ssse3,mmx,-pku,cmov,-xop,rdseed,movbe,-hle,xsaveopt,-sha,sse2,sse3,-avx512dq,
>> Assembly:
>>       .text
>>       .file    "module_KFxOBX_i4_after.ll"
>>       .globl    adjmul
>>       .align    16, 0x90
>>       .type    adjmul,@function
>> adjmul:
>>       .cfi_startproc
>>       leaq    (%rdi,%r8), %rdx
>>       addq    %rsi, %r8
>>       testb    $1, %cl
>>       cmoveq    %rdi, %rdx
>>       cmoveq    %rsi, %r8
>>       movq    %rdx, %rax
>>       sarq    $63, %rax
>>       shrq    $62, %rax
>>       addq    %rdx, %rax
>>       sarq    $2, %rax
>>       movq    %r8, %rcx
>>       sarq    $63, %rcx
>>       shrq    $62, %rcx
>>       addq    %r8, %rcx
>>       sarq    $2, %rcx
>>       movq    %rax, %rdx
>>       shlq    $5, %rdx
>>       leaq    16(%r9,%rdx), %rsi
>>       orq    $16, %rdx
>>       movq    16(%rsp), %rdi
>>       addq    %rdx, %rdi
>>       addq    8(%rsp), %rdx
>>       .align    16, 0x90
>> .LBB0_1:
>>       vmovaps    -16(%rdx), %xmm0
>>       vmovaps    (%rdx), %xmm1
>>       vmovaps    -16(%rdi), %xmm2
>>       vmovaps    (%rdi), %xmm3
>>       vmulps    %xmm3, %xmm1, %xmm4
>>       vmulps    %xmm2, %xmm1, %xmm1
>>       vfmadd213ss    %xmm4, %xmm0, %xmm2
>>       vfmsub213ss    %xmm1, %xmm0, %xmm3
>>       vmovaps    %xmm2, -16(%rsi)
>>       vmovaps    %xmm3, (%rsi)
>>       addq    $1, %rax
>>       addq    $32, %rsi
>>       addq    $32, %rdi
>>       addq    $32, %rdx
>>       cmpq    %rcx, %rax
>>       jl    .LBB0_1
>>       retq
>> .Lfunc_end0:
>>       .size    adjmul, .Lfunc_end0-adjmul
>>       .cfi_endproc
>>
>>
>>       .section    ".note.GNU-stack","",@progbits
>>
>> end assembly!
>>
>>
>> The instructions 'vfmadd213ss' are 'Fused Multiply-Add of Scalar
>> Single-Precision Floating-Point'. Those should be SIMD vector
>> instructions. Note that the KNL has 16 wide float SIMD, while the
>> exploit module uses only 4. However, the backend should be able to
>> handle this.
>>
>> Unless I receive further ideas I will file an official bug report.
>>
>> Frank
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>

Seemingly Similar Threads

Search for more possibly parallel threads

llvm dev - Jun 2016 - avx512 JIT backend generates wrong code on <4 x float>

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

[llvm-dev] avx512 JIT backend generates wrong code on <4 x float>

Seemingly Similar Threads