thr3ads.net - llvm dev - [LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012 [Aug 2013]

If this information is useful, please help other people find it:
Share via:

Kévin Szkudlapski

2013-Aug-19 19:21 UTC

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

Hi,

I'm using LLVM to convert expressions to native assembly. The problem
arises when LLVM compiles this code:

define void @fn_0000000000000000(i8*, i8*, i8*) {
bb:
  %res = alloca i32
  %3 = load i32* %res
  %4 = bitcast i8* %0 to i32*
  %5 = load i32* %4
  %6 = bitcast i8* %0 to i32*
  %7 = load i32* %6
  %8 = xor i32 %5, %7
  store volatile i32 %8, i32* %res
  %9 = load i32* %res
  %10 = icmp eq i32 %9, 0
  br i1 %10, label %then, label %else

merged:                                           ; preds = %else, %then
  %11 = load i32* %res
  %12 = and i32 %11, -2147483648
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %then3, label %else4

then:                                             ; preds = %bb
  %zf = alloca i1
  %14 = load i1* %zf
  %15 = getelementptr i8* %0, i32 148
  %16 = bitcast i8* %15 to i1*
  %17 = load i1* %16
  store volatile i1 true, i1* %16
  br label %merged

else:                                             ; preds = %bb
  %zf1 = alloca i1
  %18 = load i1* %zf1
  %19 = getelementptr i8* %0, i32 148
  %20 = bitcast i8* %19 to i1*
  %21 = load i1* %20
  store volatile i1 false, i1* %20
  br label %merged

merged2:                                          ; preds = %else4, %then3
  %22 = bitcast i8* %0 to i32*
  %23 = load i32* %22
  %24 = load i32* %res
  store volatile i32 %24, i32* %22
  %af = alloca i1
  %25 = load i1* %af
  %26 = getelementptr i8* %0, i32 148
  %27 = bitcast i8* %26 to i1*
  %28 = load i1* %27
  store volatile i1 false, i1* %27
  %of = alloca i1
  %29 = load i1* %of
  %30 = getelementptr i8* %0, i32 148
  %31 = bitcast i8* %30 to i1*
  %32 = load i1* %31
  store volatile i1 false, i1* %31
  %cf = alloca i1
  %33 = load i1* %cf
  %34 = getelementptr i8* %0, i32 148
  %35 = bitcast i8* %34 to i1*
  %36 = load i1* %35
  store volatile i1 false, i1* %35
  %37 = getelementptr i8* %0, i32 64
  %38 = bitcast i8* %37 to i32*
  %39 = load i32* %38
  %40 = getelementptr i8* %0, i32 64
  %41 = bitcast i8* %40 to i32*
  %42 = load i32* %41
  %43 = add i32 %42, 2
  store volatile i32 %43, i32* %38
  ret void

then3:                                            ; preds = %merged
  %sf = alloca i1
  %44 = load i1* %sf
  %45 = getelementptr i8* %0, i32 148
  %46 = bitcast i8* %45 to i1*
  %47 = load i1* %46
  store volatile i1 false, i1* %46
  br label %merged2

else4:                                            ; preds = %merged
  %sf5 = alloca i1
  %48 = load i1* %sf5
  %49 = getelementptr i8* %0, i32 148
  %50 = bitcast i8* %49 to i1*
  %51 = load i1* %50
  store volatile i1 true, i1* %50
  br label %merged2
}


It generates the following assembly:
0000000581D30010  push        rbp
0000000581D30011  mov         rbp,rsp
0000000581D30014  sub         rsp,10h
0000000581D30018  mov         dword ptr [rbp-4],0
0000000581D3001F  mov         al,1
0000000581D30021  test        al,al
0000000581D30023  jne         0000000581D30042
0000000581D30029  mov         eax,10h
0000000581D3002E  call        00000005F08425D0
0000000581D30033  sub         rsp,rax
0000000581D30036  mov         byte ptr [rcx+94h],0
0000000581D3003D  jmp         0000000581D30056
0000000581D30042  mov         eax,10h
0000000581D30047  call        00000005F08425D0
0000000581D3004C  sub         rsp,rax
0000000581D3004F  mov         byte ptr [rcx+94h],1
0000000581D30056  test        byte ptr [rbp-1],80h
0000000581D3005A  je          0000000581D30079
0000000581D30060  mov         eax,10h
0000000581D30065  call        00000005F08425D0
0000000581D3006A  sub         rsp,rax
0000000581D3006D  mov         byte ptr [rcx+94h],1
0000000581D30074  jmp         0000000581D3008D
0000000581D30079  mov         eax,10h
0000000581D3007E  call        00000005F08425D0
0000000581D30083  sub         rsp,rax
0000000581D30086  mov         byte ptr [rcx+94h],0
0000000581D3008D  mov         eax,dword ptr [rbp-4]
0000000581D30090  mov         dword ptr [rcx],eax
0000000581D30092  mov         eax,10h
0000000581D30097  call        00000005F08425D0
0000000581D3009C  sub         rsp,rax
0000000581D3009F  mov         byte ptr [rcx+94h],0
0000000581D300A6  mov         eax,10h
0000000581D300AB  call        00000005F08425D0
0000000581D300B0  sub         rsp,rax
0000000581D300B3  mov         byte ptr [rcx+94h],0
0000000581D300BA  mov         eax,10h
0000000581D300BF  call        00000005F08425D0
0000000581D300C4  sub         rsp,rax
0000000581D300C7  mov         byte ptr [rcx+94h],0
0000000581D300CE  add         dword ptr [rcx+40h],2
0000000581D300D2  mov         rsp,rbp
0000000581D300D5  pop         rbp
0000000581D300D6  ret

The function located at 0x00000005F08425D0 is not valid (according to
Visual Studio: 00000005F08425D0  ?? ??).

If I compile LLVM bytecode using llc, this function is __chkstk:
        .def     fn_0000000000000000;
        .scl    2;
        .type   32;
        .endef
        .text
        .globl  fn_0000000000000000
        .align  16, 0x90
fn_0000000000000000:                    # @fn_0000000000000000
# BB#0:                                 # %bb
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16
        mov     dword ptr [rbp - 4], 0
        mov     al, 1
        test    al, al
        jne     .LBB0_1
# BB#2:                                 # %else
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 0
        jmp     .LBB0_3
.LBB0_1:                                # %then
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 1
.LBB0_3:                                # %merged
        test    byte ptr [rbp - 1], -128
        je      .LBB0_4
# BB#5:                                 # %else4
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 1
        jmp     .LBB0_6
.LBB0_4:                                # %then3
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 0
.LBB0_6:                                # %merged2
        mov     eax, dword ptr [rbp - 4]
        mov     dword ptr [rcx], eax
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 0
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 0
        mov     eax, 16
        call    __chkstk
        sub     rsp, rax
        mov     byte ptr [rcx + 148], 0
        add     dword ptr [rcx + 64], 2
        mov     rsp, rbp
        pop     rbp
        ret

It seems this issue has already been described here
https://groups.google.com/forum/#!topic/llvm-commit/htNjwbWsNe8

I'm using this code
https://github.com/wisk/medusa/blob/master/src/emul/llvm/llvm_emulator.cpp
which is pretty basic.

Please, tell me if you need further information about this issue.

-- 
Kevin Szkudlapski

Michael Lewis

2013-Aug-27 03:18 UTC

head link

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

It's not a solution to the actual bug (which is, as the thread you linked
discusses, a problem with the assumption on LLVM's part that the __chkstk
function lies within 2GB of the emitted code's address space) but there is
a simple workaround: hoist all allocas to the first basic block of your
function. This allows the JIT to perform all stack allocations in a single
adjustment of the SP instead of needing to use dynamic stack allocation,
and thereby avoids the call to __chkstk entirely.


On Mon, Aug 19, 2013 at 12:21 PM, Kévin Szkudlapski <szkudl.k at
gmail.com> wrote:
> Hi,
>
> I'm using LLVM to convert expressions to native assembly, the problem
> is when LLVM compiles this code:
>
> define void @fn_0000000000000000(i8*, i8*, i8*) {
> bb:
>   %res = alloca i32
>   %3 = load i32* %res
>   %4 = bitcast i8* %0 to i32*
>   %5 = load i32* %4
>   %6 = bitcast i8* %0 to i32*
>   %7 = load i32* %6
>   %8 = xor i32 %5, %7
>   store volatile i32 %8, i32* %res
>   %9 = load i32* %res
>   %10 = icmp eq i32 %9, 0
>   br i1 %10, label %then, label %else
>
> merged:                                           ; preds = %else, %then
>   %11 = load i32* %res
>   %12 = and i32 %11, -2147483648
>   %13 = icmp eq i32 %12, 0
>   br i1 %13, label %then3, label %else4
>
> then:                                             ; preds = %bb
>   %zf = alloca i1
>   %14 = load i1* %zf
>   %15 = getelementptr i8* %0, i32 148
>   %16 = bitcast i8* %15 to i1*
>   %17 = load i1* %16
>   store volatile i1 true, i1* %16
>   br label %merged
>
> else:                                             ; preds = %bb
>   %zf1 = alloca i1
>   %18 = load i1* %zf1
>   %19 = getelementptr i8* %0, i32 148
>   %20 = bitcast i8* %19 to i1*
>   %21 = load i1* %20
>   store volatile i1 false, i1* %20
>   br label %merged
>
> merged2:                                          ; preds = %else4, %then3
>   %22 = bitcast i8* %0 to i32*
>   %23 = load i32* %22
>   %24 = load i32* %res
>   store volatile i32 %24, i32* %22
>   %af = alloca i1
>   %25 = load i1* %af
>   %26 = getelementptr i8* %0, i32 148
>   %27 = bitcast i8* %26 to i1*
>   %28 = load i1* %27
>   store volatile i1 false, i1* %27
>   %of = alloca i1
>   %29 = load i1* %of
>   %30 = getelementptr i8* %0, i32 148
>   %31 = bitcast i8* %30 to i1*
>   %32 = load i1* %31
>   store volatile i1 false, i1* %31
>   %cf = alloca i1
>   %33 = load i1* %cf
>   %34 = getelementptr i8* %0, i32 148
>   %35 = bitcast i8* %34 to i1*
>   %36 = load i1* %35
>   store volatile i1 false, i1* %35
>   %37 = getelementptr i8* %0, i32 64
>   %38 = bitcast i8* %37 to i32*
>   %39 = load i32* %38
>   %40 = getelementptr i8* %0, i32 64
>   %41 = bitcast i8* %40 to i32*
>   %42 = load i32* %41
>   %43 = add i32 %42, 2
>   store volatile i32 %43, i32* %38
>   ret void
>
> then3:                                            ; preds = %merged
>   %sf = alloca i1
>   %44 = load i1* %sf
>   %45 = getelementptr i8* %0, i32 148
>   %46 = bitcast i8* %45 to i1*
>   %47 = load i1* %46
>   store volatile i1 false, i1* %46
>   br label %merged2
>
> else4:                                            ; preds = %merged
>   %sf5 = alloca i1
>   %48 = load i1* %sf5
>   %49 = getelementptr i8* %0, i32 148
>   %50 = bitcast i8* %49 to i1*
>   %51 = load i1* %50
>   store volatile i1 true, i1* %50
>   br label %merged2
> }
>
>
> It generates the following assembly:
> 0000000581D30010  push        rbp
> 0000000581D30011  mov         rbp,rsp
> 0000000581D30014  sub         rsp,10h
> 0000000581D30018  mov         dword ptr [rbp-4],0
> 0000000581D3001F  mov         al,1
> 0000000581D30021  test        al,al
> 0000000581D30023  jne         0000000581D30042
> 0000000581D30029  mov         eax,10h
> 0000000581D3002E  call        00000005F08425D0
> 0000000581D30033  sub         rsp,rax
> 0000000581D30036  mov         byte ptr [rcx+94h],0
> 0000000581D3003D  jmp         0000000581D30056
> 0000000581D30042  mov         eax,10h
> 0000000581D30047  call        00000005F08425D0
> 0000000581D3004C  sub         rsp,rax
> 0000000581D3004F  mov         byte ptr [rcx+94h],1
> 0000000581D30056  test        byte ptr [rbp-1],80h
> 0000000581D3005A  je          0000000581D30079
> 0000000581D30060  mov         eax,10h
> 0000000581D30065  call        00000005F08425D0
> 0000000581D3006A  sub         rsp,rax
> 0000000581D3006D  mov         byte ptr [rcx+94h],1
> 0000000581D30074  jmp         0000000581D3008D
> 0000000581D30079  mov         eax,10h
> 0000000581D3007E  call        00000005F08425D0
> 0000000581D30083  sub         rsp,rax
> 0000000581D30086  mov         byte ptr [rcx+94h],0
> 0000000581D3008D  mov         eax,dword ptr [rbp-4]
> 0000000581D30090  mov         dword ptr [rcx],eax
> 0000000581D30092  mov         eax,10h
> 0000000581D30097  call        00000005F08425D0
> 0000000581D3009C  sub         rsp,rax
> 0000000581D3009F  mov         byte ptr [rcx+94h],0
> 0000000581D300A6  mov         eax,10h
> 0000000581D300AB  call        00000005F08425D0
> 0000000581D300B0  sub         rsp,rax
> 0000000581D300B3  mov         byte ptr [rcx+94h],0
> 0000000581D300BA  mov         eax,10h
> 0000000581D300BF  call        00000005F08425D0
> 0000000581D300C4  sub         rsp,rax
> 0000000581D300C7  mov         byte ptr [rcx+94h],0
> 0000000581D300CE  add         dword ptr [rcx+40h],2
> 0000000581D300D2  mov         rsp,rbp
> 0000000581D300D5  pop         rbp
> 0000000581D300D6  ret
>
> The function located at 0x00000005F08425D0 is not valid (according to
> visual studio: 00000005F08425D0  ?? ??).
>
> If I compile LLVM bytecode using llc, this function is __chkstk:
>         .def     fn_0000000000000000;
>         .scl    2;
>         .type   32;
>         .endef
>         .text
>         .globl  fn_0000000000000000
>         .align  16, 0x90
> fn_0000000000000000:                    # @fn_0000000000000000
> # BB#0:                                 # %bb
>         push    rbp
>         mov     rbp, rsp
>         sub     rsp, 16
>         mov     dword ptr [rbp - 4], 0
>         mov     al, 1
>         test    al, al
>         jne     .LBB0_1
> # BB#2:                                 # %else
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 0
>         jmp     .LBB0_3
> .LBB0_1:                                # %then
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 1
> .LBB0_3:                                # %merged
>         test    byte ptr [rbp - 1], -128
>         je      .LBB0_4
> # BB#5:                                 # %else4
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 1
>         jmp     .LBB0_6
> .LBB0_4:                                # %then3
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 0
> .LBB0_6:                                # %merged2
>         mov     eax, dword ptr [rbp - 4]
>         mov     dword ptr [rcx], eax
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 0
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 0
>         mov     eax, 16
>         call    __chkstk
>         sub     rsp, rax
>         mov     byte ptr [rcx + 148], 0
>         add     dword ptr [rcx + 64], 2
>         mov     rsp, rbp
>         pop     rbp
>         ret
>
> It seems this issue has already been described here
> https://groups.google.com/forum/#!topic/llvm-commit/htNjwbWsNe8
>
> I'm using this code
> https://github.com/wisk/medusa/blob/master/src/emul/llvm/llvm_emulator.cpp
> which is pretty basic.
>
> Please, tell me if you need further information about this issue.
>
> --
> Kevin Szkudlapski
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20130826/0339e342/attachment.html>

Kévin Szkudlapski

2013-Sep-01 19:37 UTC

head link

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

Hi Michael,

On Tue, Aug 27, 2013 at 5:18 AM, Michael Lewis <don.apoch at gmail.com>
wrote:
> It's not a solution to the actual bug (which is, as the thread you linked
> discusses, a problem with the assumption on LLVM's part that the __chkstk
> function lies within 2GB of the emitted code's address space) but there is a
> simple workaround: hoist all allocas to the first basic block of your
> function. This allows the JIT to perform all stack allocations in a single
> adjustment of the SP instead of needing to use dynamic stack allocation, and
> thereby avoids the call to __chkstk entirely.

Thanks for your answer, I redesigned my code and it looks OK now.
However, I tried to insert a breakpoint at ``if (NumBytes >= 4096 &&
STI.isTargetCOFF() && !STI.isTargetEnvMacho())'' but the condition is
never met.

So I'm still wondering where, in LLVM, the call to __chkstk is
emitted. If anybody has an idea, please let me know. :)


-- 
Kevin Szkudlapski

Anton Korobeynikov

2013-Sep-01 20:10 UTC

head link

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

> It's not a solution to the actual bug (which is, as the thread you linked
> discusses, a problem with the assumption on LLVM's part that the __chkstk
> function lies within 2GB of the emitted code's address space) but there is a
> simple workaround: hoist all allocas to the first basic block of your
> function. This allows the JIT to perform all stack allocations in a single
> adjustment of the SP instead of needing to use dynamic stack allocation, and
> thereby avoids the call to __chkstk entirely.

__chkstk is not connected with dynamic stack allocation at all, in
general. On Windows, when one allocates more than 1 page (4 KB) of
stack, it is necessary to touch all the allocated space in order
to ensure the proper order of guard page allocation. Surely, it's
always required for dynamic stack allocation, because the amount of
allocation is not known in advance; however, it can be triggered for
static code as well. Consider e.g.

void bar(int*);
void baz() {
  int foo[2000];
  bar(foo);
}

-- 
With best regards, Anton Korobeynikov
Faculty of Mathematics and Mechanics, Saint Petersburg State University

Possibly Parallel Threads

Search for more possibly parallel threads

llvm dev - Aug 2013 - [LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

Possibly Parallel Threads