Andrew Kelley via llvm-dev
2018-Apr-26 03:44 UTC
[llvm-dev] windows ABI problem with i128?
I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
command from the compiler-rt project:
[nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S
-emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows
-DCRT_HAS_128BIT
The resulting LLVM IR is:
================================================================
; ModuleID = 'lib/builtins/udivti3.c'
source_filename = "lib/builtins/udivti3.c"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--windows-msvc19.11.0"
; Function Attrs: noinline nounwind optnone uwtable
define i128 @__udivti3(i128, i128) #0 {
%3 = alloca i128, align 16
%4 = alloca i128, align 16
store i128 %1, i128* %3, align 16
store i128 %0, i128* %4, align 16
%5 = load i128, i128* %3, align 16
%6 = load i128, i128* %4, align 16
%7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
ret i128 %7
}
declare i128 @__udivmodti4(i128, i128, i128*) #1
attributes #0 = { noinline nounwind optnone uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="false"
"no-infs-fp-math"="false"
"no-jump-tables"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
attributes #1 = {
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="false"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
================================================================
However, I think this results in a different ABI than the one LLVM will use
when you do i128 division. For example, here is my test case (in Zig code):
================================================================
pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint)
noreturn;
export fn WinMainCRTStartup() noreturn {
@setAlignStack(16);
@setRuntimeSafety(false);
var a: u128 = 152313999999999991610955792383;
var b: u128 = 10000000000000000000;
var c = a / b; // this generates a call to __udivti3
if (c != b) {
@breakpoint();
}
ExitProcess(0);
}
export fn __udivti3(a: u128, b: u128) u128 {
@setRuntimeSafety(false);
return b;
}
================================================================
This results in this LLVM IR:
================================================================
; ModuleID = 'test'
source_filename = "test"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
%"[]u8" = type { i8*, i64 }
%StackTrace = type { i64, %"[]usize" }
%"[]usize" = type { i64*, i64 }
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
alignstack(16)
define void @WinMainCRTStartup() #2 !dbg !41 {
Entry:
%a = alloca i128, align 8
%b = alloca i128, align 8
%c = alloca i128, align 8
store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
!DIExpression()), !dbg !52
store i128 10000000000000000000, i128* %b, align 8, !dbg !53
call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
!DIExpression()), !dbg !53
%0 = load i128, i128* %a, align 8, !dbg !54
%1 = load i128, i128* %b, align 8, !dbg !55
%2 = udiv i128 %0, %1, !dbg !56
store i128 %2, i128* %c, align 8, !dbg !57
call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
!DIExpression()), !dbg !57
%3 = load i128, i128* %c, align 8, !dbg !58
%4 = load i128, i128* %b, align 8, !dbg !60
%5 = icmp ne i128 %3, %4, !dbg !61
br i1 %5, label %Then, label %Else, !dbg !61
Then: ; preds = %Entry
call void @llvm.debugtrap(), !dbg !62
br label %EndIf, !dbg !64
Else: ; preds = %Entry
br label %EndIf, !dbg !64
EndIf: ; preds = %Else, %Then
call void @ExitProcess(i32 0), !dbg !65
unreachable, !dbg !65
}
; Function Attrs: nounwind
declare void @llvm.debugtrap() #3
; Function Attrs: nobuiltin noreturn nounwind uwtable
declare void @ExitProcess(i32) #0
; Function Attrs: nobuiltin nounwind uwtable
define i128 @__udivti3(i128, i128) #4 !dbg !66 {
Entry:
%a = alloca i128, align 8
%b = alloca i128, align 8
store i128 %0, i128* %a, align 8
call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
!DIExpression()), !dbg !73
store i128 %1, i128* %b, align 8
call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
!DIExpression()), !dbg !74
%2 = load i128, i128* %b, align 8, !dbg !75
ret i128 %2, !dbg !78
}
; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #3
attributes #0 = { nobuiltin noreturn nounwind uwtable
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nobuiltin noinline noreturn nounwind uwtable
alignstack=16 "no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf" }
attributes #3 = { nounwind }
attributes #4 = { nobuiltin nounwind uwtable
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf" }
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
================================================================
When I link this (with link.exe or LLD; it does not matter which):
link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console kernel32.lib /nologo
and run it, it triggers the breakpoint.
Meanwhile on linux, this test passes.
I suspect it may be a calling convention issue. Here is the assembly for
the linux x86_64 version:
================================================================
0000000000000010 <_start>:
10: 55 push %rbp
11: 48 89 e5 mov %rsp,%rbp
14: 48 83 ec 40 sub $0x40,%rsp
18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
1f: 00 00 00
22: 48 89 45 f8 mov %rax,-0x8(%rbp)
26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
2d: 77 73 ff
30: 48 89 45 f0 mov %rax,-0x10(%rbp)
34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
3b: 23 c7 8a
3e: 48 89 45 e0 mov %rax,-0x20(%rbp)
42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
49: 00
4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi
4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi
52: 48 8b 55 e0 mov -0x20(%rbp),%rdx
56: 48 8b 4d e8 mov -0x18(%rbp),%rcx
5a: e8 00 00 00 00 callq 5f <_start+0x4f>
5f: 48 89 55 d8 mov %rdx,-0x28(%rbp)
63: 48 89 45 d0 mov %rax,-0x30(%rbp)
67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0
6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1
71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0
75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d
79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d
80: 44 89 45 cc mov %r8d,-0x34(%rbp)
84: 74 06 je 8c <_start+0x7c>
86: eb 00 jmp 88 <_start+0x78>
88: eb 00 jmp 8a <_start+0x7a>
8a: eb fe jmp 8a <_start+0x7a>
8c: eb 00 jmp 8e <_start+0x7e>
8e: 48 83 c4 40 add $0x40,%rsp
92: 5d pop %rbp
93: c3 retq
94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1)
9b: 00 00 00 00 00
00000000000000a0 <__udivti3>:
a0: 55 push %rbp
a1: 48 89 e5 mov %rsp,%rbp
a4: 48 89 7d f0 mov %rdi,-0x10(%rbp)
a8: 48 89 75 f8 mov %rsi,-0x8(%rbp)
ac: 48 89 4d e8 mov %rcx,-0x18(%rbp)
b0: 48 89 55 e0 mov %rdx,-0x20(%rbp)
b4: 48 8b 45 e0 mov -0x20(%rbp),%rax
b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx
bc: 5d pop %rbp
bd: c3 retq
================================================================
And here is the assembly for the windows x86_64 version:
================================================================
0000000000000010 <_start>:
10: 55 push %rbp
11: 48 81 ec 80 00 00 00 sub $0x80,%rsp
18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp
1f: 00
20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
27: 00 00 00
2a: 48 89 45 f8 mov %rax,-0x8(%rbp)
2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
35: 77 73 ff
38: 48 89 45 f0 mov %rax,-0x10(%rbp)
3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
43: 23 c7 8a
46: 48 89 45 e0 mov %rax,-0x20(%rbp)
4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
51: 00
52: 48 8b 45 f0 mov -0x10(%rbp),%rax
56: 48 8b 4d f8 mov -0x8(%rbp),%rcx
5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx
5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8
62: 48 89 4d c8 mov %rcx,-0x38(%rbp)
66: 48 89 45 c0 mov %rax,-0x40(%rbp)
6a: 4c 89 45 b8 mov %r8,-0x48(%rbp)
6e: 48 89 55 b0 mov %rdx,-0x50(%rbp)
72: 48 8d 4d c0 lea -0x40(%rbp),%rcx
76: 48 8d 55 b0 lea -0x50(%rbp),%rdx
7a: e8 41 00 00 00 callq c0 <__udivti3>
7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1
84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp)
89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp)
8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0
92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1
96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0
9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d
9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d
a6: 44 89 4d ac mov %r9d,-0x54(%rbp)
aa: 74 06 je b2 <_start+0xa2>
ac: eb 00 jmp ae <_start+0x9e>
ae: eb 00 jmp b0 <_start+0xa0>
b0: eb fe jmp b0 <_start+0xa0>
b2: eb 00 jmp b4 <_start+0xa4>
b4: 48 81 c4 80 00 00 00 add $0x80,%rsp
bb: 5d pop %rbp
bc: c3 retq
bd: 90 nop
be: 90 nop
bf: 90 nop
00000000000000c0 <__udivti3>:
c0: 55 push %rbp
c1: 48 83 ec 20 sub $0x20,%rsp
c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp
ca: 48 89 4d f0 mov %rcx,-0x10(%rbp)
ce: 48 89 55 f8 mov %rdx,-0x8(%rbp)
d2: 4c 89 4d e8 mov %r9,-0x18(%rbp)
d6: 4c 89 45 e0 mov %r8,-0x20(%rbp)
da: 48 8b 45 e0 mov -0x20(%rbp),%rax
de: 48 8b 55 e8 mov -0x18(%rbp),%rdx
e2: 48 83 c4 20 add $0x20,%rsp
e6: 5d pop %rbp
e7: c3 retq
================================================================
Finally, my question:
What is the correct LLVM IR to represent i128 values so that it will be
compatible with the compiler-rt calls that LLVM generates? For example,
what should be the LLVM IR definition of __udivti3?
Even though the clang/compiler-rt project generates
`define i128 @__udivti3(i128, i128) #0 {`, this appears to be incorrect when
run on Windows.
Thanks,
Andrew
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20180425/e8b7cbf8/attachment.html>
Anton Korobeynikov via llvm-dev
2018-Apr-26 07:44 UTC
[llvm-dev] windows ABI problem with i128?
Most probably you need to properly specify the calling convention the backend is using for calling the runtime functions. Or implement the stub for udivti3 that performs the necessary argument lifting.

I guess there is no standard ABI document describing the intended calling convention here, so I'd just do what mingw64 does here and make everything here compatible.

On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev <llvm-dev at lists.llvm.org> wrote:
> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this > command from the compiler-rt project: > > [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S > -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows > -DCRT_HAS_128BIT > > The resulting LLVM IR is: > ================================================================> > ; ModuleID = 'lib/builtins/udivti3.c' > source_filename = "lib/builtins/udivti3.c" > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64--windows-msvc19.11.0" > > ; Function Attrs: noinline nounwind optnone uwtable > define i128 @__udivti3(i128, i128) #0 { > %3 = alloca i128, align 16 > %4 = alloca i128, align 16 > store i128 %1, i128* %3, align 16 > store i128 %0, i128* %4, align 16 > %5 = load i128, i128* %3, align 16 > %6 = load i128, i128* %4, align 16 > %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null) > ret i128 %7 > } > > declare i128 @__udivmodti4(i128, i128, i128*) #1 > > attributes #0 = { noinline nounwind optnone uwtable > "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" > "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" > "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" > "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" > "stack-protector-buffer-size"="8" "target-cpu"="x86-64" > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" > "use-soft-float"="false" } > attributes #1 = { 
"correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > > !llvm.module.flags = !{!0, !1} > !llvm.ident = !{!2} > > !0 = !{i32 1, !"wchar_size", i32 2} > !1 = !{i32 7, !"PIC Level", i32 2} > !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"} > > > ================================================================> However I think this results in a different ABI than LLVM will use when you > do i128 division. For example, here is my test case (in zig code): > ================================================================> > pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn; > > export fn WinMainCRTStartup() noreturn { > @setAlignStack(16); > @setRuntimeSafety(false); > > var a: u128 = 152313999999999991610955792383; > var b: u128 = 10000000000000000000; > var c = a / b; // this generates a call to __udivti3 > > if (c != b) { > @breakpoint(); > } > ExitProcess(0); > } > > export fn __udivti3(a: u128, b: u128) u128 { > @setRuntimeSafety(false); > return b; > } > > > ================================================================> This results in this LLVM IR: > ================================================================> > ; ModuleID = 'test' > source_filename = "test" > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-pc-windows-msvc" > > %"[]u8" = type { i8*, i64 } > %StackTrace = type { i64, %"[]usize" } > %"[]usize" = type { i64*, i64 } > > ; Function Attrs: nounwind readnone speculatable > declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 > > ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable > alignstack(16) 
> define void @WinMainCRTStartup() #2 !dbg !41 { > Entry: > %a = alloca i128, align 8 > %b = alloca i128, align 8 > %c = alloca i128, align 8 > store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52 > call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata > !DIExpression()), !dbg !52 > store i128 10000000000000000000, i128* %b, align 8, !dbg !53 > call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata > !DIExpression()), !dbg !53 > %0 = load i128, i128* %a, align 8, !dbg !54 > %1 = load i128, i128* %b, align 8, !dbg !55 > %2 = udiv i128 %0, %1, !dbg !56 > store i128 %2, i128* %c, align 8, !dbg !57 > call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata > !DIExpression()), !dbg !57 > %3 = load i128, i128* %c, align 8, !dbg !58 > %4 = load i128, i128* %b, align 8, !dbg !60 > %5 = icmp ne i128 %3, %4, !dbg !61 > br i1 %5, label %Then, label %Else, !dbg !61 > > Then: ; preds = %Entry > call void @llvm.debugtrap(), !dbg !62 > br label %EndIf, !dbg !64 > > Else: ; preds = %Entry > br label %EndIf, !dbg !64 > > EndIf: ; preds = %Else, %Then > call void @ExitProcess(i32 0), !dbg !65 > unreachable, !dbg !65 > } > > ; Function Attrs: nounwind > declare void @llvm.debugtrap() #3 > > ; Function Attrs: nobuiltin noreturn nounwind uwtable > declare void @ExitProcess(i32) #0 > > ; Function Attrs: nobuiltin nounwind uwtable > define i128 @__udivti3(i128, i128) #4 !dbg !66 { > Entry: > %a = alloca i128, align 8 > %b = alloca i128, align 8 > store i128 %0, i128* %a, align 8 > call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata > !DIExpression()), !dbg !73 > store i128 %1, i128* %b, align 8 > call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata > !DIExpression()), !dbg !74 > %2 = load i128, i128* %b, align 8, !dbg !75 > ret i128 %2, !dbg !78 > } > > ; Function Attrs: nounwind > declare void @llvm.stackprotector(i8*, i8**) #3 > > attributes #0 = { nobuiltin noreturn nounwind uwtable > 
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > attributes #1 = { nounwind readnone speculatable } > attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16 > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > attributes #3 = { nounwind } > attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true" > "no-frame-pointer-elim-non-leaf" } > > !llvm.module.flags = !{!0} > !llvm.dbg.cu = !{!1} > > ================================================================> > When I link this with (link.exe or LLD, it does not matter): > link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console > kernel32.lib /nologo > > And run it, it triggers the breakpoint. > > Meanwhile on linux, this test passes. > > I suspect it may be a calling convention issue. Here is the assembly for the > linux x86_64 version: > > > ================================================================> 0000000000000010 <_start>: > 10: 55 push %rbp > 11: 48 89 e5 mov %rsp,%rbp > 14: 48 83 ec 40 sub $0x40,%rsp > 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > 1f: 00 00 00 > 22: 48 89 45 f8 mov %rax,-0x8(%rbp) > 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > 2d: 77 73 ff > 30: 48 89 45 f0 mov %rax,-0x10(%rbp) > 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > 3b: 23 c7 8a > 3e: 48 89 45 e0 mov %rax,-0x20(%rbp) > 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > 49: 00 > 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi > 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi > 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx > 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx > 5a: e8 00 00 00 00 callq 5f <_start+0x4f> > 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp) > 63: 48 89 45 d0 mov %rax,-0x30(%rbp) > 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0 > 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1 > 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0 > 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d > 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d > 80: 44 89 45 cc mov %r8d,-0x34(%rbp) > 
84: 74 06 je 8c <_start+0x7c> > 86: eb 00 jmp 88 <_start+0x78> > 88: eb 00 jmp 8a <_start+0x7a> > 8a: eb fe jmp 8a <_start+0x7a> > 8c: eb 00 jmp 8e <_start+0x7e> > 8e: 48 83 c4 40 add $0x40,%rsp > 92: 5d pop %rbp > 93: c3 retq > 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) > 9b: 00 00 00 00 00 > > 00000000000000a0 <__udivti3>: > a0: 55 push %rbp > a1: 48 89 e5 mov %rsp,%rbp > a4: 48 89 7d f0 mov %rdi,-0x10(%rbp) > a8: 48 89 75 f8 mov %rsi,-0x8(%rbp) > ac: 48 89 4d e8 mov %rcx,-0x18(%rbp) > b0: 48 89 55 e0 mov %rdx,-0x20(%rbp) > b4: 48 8b 45 e0 mov -0x20(%rbp),%rax > b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx > bc: 5d pop %rbp > bd: c3 retq > > > ================================================================> > And here is the assembly for the windows x86_64 version: > > > ================================================================> 0000000000000010 <_start>: > 10: 55 push %rbp > 11: 48 81 ec 80 00 00 00 sub $0x80,%rsp > 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp > 1f: 00 > 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > 27: 00 00 00 > 2a: 48 89 45 f8 mov %rax,-0x8(%rbp) > 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > 35: 77 73 ff > 38: 48 89 45 f0 mov %rax,-0x10(%rbp) > 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > 43: 23 c7 8a > 46: 48 89 45 e0 mov %rax,-0x20(%rbp) > 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > 51: 00 > 52: 48 8b 45 f0 mov -0x10(%rbp),%rax > 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx > 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx > 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8 > 62: 48 89 4d c8 mov %rcx,-0x38(%rbp) > 66: 48 89 45 c0 mov %rax,-0x40(%rbp) > 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp) > 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp) > 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx > 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx > 7a: e8 41 00 00 00 callq c0 <__udivti3> > 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1 > 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp) > 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp) > 8e: 0f 10 45 d0 movups 
-0x30(%rbp),%xmm0 > 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1 > 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0 > 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d > 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d > a6: 44 89 4d ac mov %r9d,-0x54(%rbp) > aa: 74 06 je b2 <_start+0xa2> > ac: eb 00 jmp ae <_start+0x9e> > ae: eb 00 jmp b0 <_start+0xa0> > b0: eb fe jmp b0 <_start+0xa0> > b2: eb 00 jmp b4 <_start+0xa4> > b4: 48 81 c4 80 00 00 00 add $0x80,%rsp > bb: 5d pop %rbp > bc: c3 retq > bd: 90 nop > be: 90 nop > bf: 90 nop > > 00000000000000c0 <__udivti3>: > c0: 55 push %rbp > c1: 48 83 ec 20 sub $0x20,%rsp > c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp > ca: 48 89 4d f0 mov %rcx,-0x10(%rbp) > ce: 48 89 55 f8 mov %rdx,-0x8(%rbp) > d2: 4c 89 4d e8 mov %r9,-0x18(%rbp) > d6: 4c 89 45 e0 mov %r8,-0x20(%rbp) > da: 48 8b 45 e0 mov -0x20(%rbp),%rax > de: 48 8b 55 e8 mov -0x18(%rbp),%rdx > e2: 48 83 c4 20 add $0x20,%rsp > e6: 5d pop %rbp > e7: c3 retq > > ================================================================> > > Finally, my question: > > What is the correct LLVM IR to represent i128 values so that it will be > compatible with the compiler-rt calls that LLVM generates? For example, what > should be the LLVM IR definition of __udivti3? > > Because even though clang/compiler-rt project generates `define i128 > @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on > windows. > > Thanks, > Andrew > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-- With best regards, Anton Korobeynikov Department of Statistical Modelling, Saint Petersburg State University
Andrew Kelley via llvm-dev
2018-Apr-26 15:30 UTC
[llvm-dev] windows ABI problem with i128?
On Thu, Apr 26, 2018 at 3:44 AM, Anton Korobeynikov <anton at korobeynikov.info> wrote:
> Most probably you need to properly specify the calling convention the
> backend is using for calling the runtime functions.

Thanks for the tip. Can you be more specific? Are you suggesting there is some config parameter I can set before running TargetMachineEmitToFile? Do you know what calling convention it is trying to use at the call site? Perhaps I can simply select a different convention from this list for the implementation of udivti3?
http://llvm.org/docs/LangRef.html#calling-conventions

> Or implement the
> stub for udivti3 that performs the necessary argument lifting.
>
> I guess there is no standard ABI document describing the intended
> calling convention here, so I'd just do what mingw64 does here and
> make everything here compatible.
>
> On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
> <llvm-dev at lists.llvm.org> wrote:
> > I'm trying to use LLVM to create compiler-rt.o on Windows. 
I use this > > command from the compiler-rt project: > > > > [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S > > -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows > > -DCRT_HAS_128BIT > > > > The resulting LLVM IR is: > > ================================================================> > > > ; ModuleID = 'lib/builtins/udivti3.c' > > source_filename = "lib/builtins/udivti3.c" > > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > > target triple = "x86_64--windows-msvc19.11.0" > > > > ; Function Attrs: noinline nounwind optnone uwtable > > define i128 @__udivti3(i128, i128) #0 { > > %3 = alloca i128, align 16 > > %4 = alloca i128, align 16 > > store i128 %1, i128* %3, align 16 > > store i128 %0, i128* %4, align 16 > > %5 = load i128, i128* %3, align 16 > > %6 = load i128, i128* %4, align 16 > > %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null) > > ret i128 %7 > > } > > > > declare i128 @__udivmodti4(i128, i128, i128*) #1 > > > > attributes #0 = { noinline nounwind optnone uwtable > > "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" > > "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" > > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" > > "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" > > "stack-protector-buffer-size"="8" "target-cpu"="x86-64" > > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" > > "use-soft-float"="false" } > > attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" > > "disable-tail-calls"="false" "less-precise-fpmad"="false" > > "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" > > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > > "unsafe-fp-math"="false" "use-soft-float"="false" } > > > > !llvm.module.flags = !{!0, !1} 
> > !llvm.ident = !{!2} > > > > !0 = !{i32 1, !"wchar_size", i32 2} > > !1 = !{i32 7, !"PIC Level", i32 2} > > !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"} > > > > > > ================================================================> > However I think this results in a different ABI than LLVM will use when > you > > do i128 division. For example, here is my test case (in zig code): > > ================================================================> > > > pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) > noreturn; > > > > export fn WinMainCRTStartup() noreturn { > > @setAlignStack(16); > > @setRuntimeSafety(false); > > > > var a: u128 = 152313999999999991610955792383; > > var b: u128 = 10000000000000000000; > > var c = a / b; // this generates a call to __udivti3 > > > > if (c != b) { > > @breakpoint(); > > } > > ExitProcess(0); > > } > > > > export fn __udivti3(a: u128, b: u128) u128 { > > @setRuntimeSafety(false); > > return b; > > } > > > > > > ================================================================> > This results in this LLVM IR: > > ================================================================> > > > ; ModuleID = 'test' > > source_filename = "test" > > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > > target triple = "x86_64-pc-windows-msvc" > > > > %"[]u8" = type { i8*, i64 } > > %StackTrace = type { i64, %"[]usize" } > > %"[]usize" = type { i64*, i64 } > > > > ; Function Attrs: nounwind readnone speculatable > > declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 > > > > ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable > > alignstack(16) > > define void @WinMainCRTStartup() #2 !dbg !41 { > > Entry: > > %a = alloca i128, align 8 > > %b = alloca i128, align 8 > > %c = alloca i128, align 8 > > store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52 > > call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata > > !DIExpression()), !dbg !52 > > store 
i128 10000000000000000000, i128* %b, align 8, !dbg !53 > > call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata > > !DIExpression()), !dbg !53 > > %0 = load i128, i128* %a, align 8, !dbg !54 > > %1 = load i128, i128* %b, align 8, !dbg !55 > > %2 = udiv i128 %0, %1, !dbg !56 > > store i128 %2, i128* %c, align 8, !dbg !57 > > call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata > > !DIExpression()), !dbg !57 > > %3 = load i128, i128* %c, align 8, !dbg !58 > > %4 = load i128, i128* %b, align 8, !dbg !60 > > %5 = icmp ne i128 %3, %4, !dbg !61 > > br i1 %5, label %Then, label %Else, !dbg !61 > > > > Then: ; preds = %Entry > > call void @llvm.debugtrap(), !dbg !62 > > br label %EndIf, !dbg !64 > > > > Else: ; preds = %Entry > > br label %EndIf, !dbg !64 > > > > EndIf: ; preds = %Else, %Then > > call void @ExitProcess(i32 0), !dbg !65 > > unreachable, !dbg !65 > > } > > > > ; Function Attrs: nounwind > > declare void @llvm.debugtrap() #3 > > > > ; Function Attrs: nobuiltin noreturn nounwind uwtable > > declare void @ExitProcess(i32) #0 > > > > ; Function Attrs: nobuiltin nounwind uwtable > > define i128 @__udivti3(i128, i128) #4 !dbg !66 { > > Entry: > > %a = alloca i128, align 8 > > %b = alloca i128, align 8 > > store i128 %0, i128* %a, align 8 > > call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata > > !DIExpression()), !dbg !73 > > store i128 %1, i128* %b, align 8 > > call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata > > !DIExpression()), !dbg !74 > > %2 = load i128, i128* %b, align 8, !dbg !75 > > ret i128 %2, !dbg !78 > > } > > > > ; Function Attrs: nounwind > > declare void @llvm.stackprotector(i8*, i8**) #3 > > > > attributes #0 = { nobuiltin noreturn nounwind uwtable > > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > > attributes #1 = { nounwind readnone speculatable } > > attributes #2 = { nobuiltin noinline noreturn nounwind uwtable > alignstack=16 > > 
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > > attributes #3 = { nounwind } > > attributes #4 = { nobuiltin nounwind uwtable > "no-frame-pointer-elim"="true" > > "no-frame-pointer-elim-non-leaf" } > > > > !llvm.module.flags = !{!0} > > !llvm.dbg.cu = !{!1} > > > > ================================================================> > > > When I link this with (link.exe or LLD, it does not matter): > > link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj > /subsystem:console > > kernel32.lib /nologo > > > > And run it, it triggers the breakpoint. > > > > Meanwhile on linux, this test passes. > > > > I suspect it may be a calling convention issue. Here is the assembly for > the > > linux x86_64 version: > > > > > > ================================================================> > 0000000000000010 <_start>: > > 10: 55 push %rbp > > 11: 48 89 e5 mov %rsp,%rbp > > 14: 48 83 ec 40 sub $0x40,%rsp > > 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > > 1f: 00 00 00 > > 22: 48 89 45 f8 mov %rax,-0x8(%rbp) > > 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > > 2d: 77 73 ff > > 30: 48 89 45 f0 mov %rax,-0x10(%rbp) > > 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > > 3b: 23 c7 8a > > 3e: 48 89 45 e0 mov %rax,-0x20(%rbp) > > 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > > 49: 00 > > 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi > > 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi > > 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx > > 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx > > 5a: e8 00 00 00 00 callq 5f <_start+0x4f> > > 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp) > > 63: 48 89 45 d0 mov %rax,-0x30(%rbp) > > 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0 > > 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1 > > 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0 > > 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d > > 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d > > 80: 44 89 45 cc mov %r8d,-0x34(%rbp) > > 84: 74 06 je 8c <_start+0x7c> > > 86: eb 00 jmp 88 <_start+0x78> > > 88: eb 00 jmp 8a 
<_start+0x7a> > > 8a: eb fe jmp 8a <_start+0x7a> > > 8c: eb 00 jmp 8e <_start+0x7e> > > 8e: 48 83 c4 40 add $0x40,%rsp > > 92: 5d pop %rbp > > 93: c3 retq > > 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) > > 9b: 00 00 00 00 00 > > > > 00000000000000a0 <__udivti3>: > > a0: 55 push %rbp > > a1: 48 89 e5 mov %rsp,%rbp > > a4: 48 89 7d f0 mov %rdi,-0x10(%rbp) > > a8: 48 89 75 f8 mov %rsi,-0x8(%rbp) > > ac: 48 89 4d e8 mov %rcx,-0x18(%rbp) > > b0: 48 89 55 e0 mov %rdx,-0x20(%rbp) > > b4: 48 8b 45 e0 mov -0x20(%rbp),%rax > > b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx > > bc: 5d pop %rbp > > bd: c3 retq > > > > > > ================================================================> > > > And here is the assembly for the windows x86_64 version: > > > > > > ================================================================> > 0000000000000010 <_start>: > > 10: 55 push %rbp > > 11: 48 81 ec 80 00 00 00 sub $0x80,%rsp > > 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp > > 1f: 00 > > 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > > 27: 00 00 00 > > 2a: 48 89 45 f8 mov %rax,-0x8(%rbp) > > 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > > 35: 77 73 ff > > 38: 48 89 45 f0 mov %rax,-0x10(%rbp) > > 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > > 43: 23 c7 8a > > 46: 48 89 45 e0 mov %rax,-0x20(%rbp) > > 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > > 51: 00 > > 52: 48 8b 45 f0 mov -0x10(%rbp),%rax > > 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx > > 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx > > 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8 > > 62: 48 89 4d c8 mov %rcx,-0x38(%rbp) > > 66: 48 89 45 c0 mov %rax,-0x40(%rbp) > > 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp) > > 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp) > > 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx > > 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx > > 7a: e8 41 00 00 00 callq c0 <__udivti3> > > 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1 > > 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp) > > 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp) > > 
8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0 > > 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1 > > 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0 > > 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d > > 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d > > a6: 44 89 4d ac mov %r9d,-0x54(%rbp) > > aa: 74 06 je b2 <_start+0xa2> > > ac: eb 00 jmp ae <_start+0x9e> > > ae: eb 00 jmp b0 <_start+0xa0> > > b0: eb fe jmp b0 <_start+0xa0> > > b2: eb 00 jmp b4 <_start+0xa4> > > b4: 48 81 c4 80 00 00 00 add $0x80,%rsp > > bb: 5d pop %rbp > > bc: c3 retq > > bd: 90 nop > > be: 90 nop > > bf: 90 nop > > > > 00000000000000c0 <__udivti3>: > > c0: 55 push %rbp > > c1: 48 83 ec 20 sub $0x20,%rsp > > c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp > > ca: 48 89 4d f0 mov %rcx,-0x10(%rbp) > > ce: 48 89 55 f8 mov %rdx,-0x8(%rbp) > > d2: 4c 89 4d e8 mov %r9,-0x18(%rbp) > > d6: 4c 89 45 e0 mov %r8,-0x20(%rbp) > > da: 48 8b 45 e0 mov -0x20(%rbp),%rax > > de: 48 8b 55 e8 mov -0x18(%rbp),%rdx > > e2: 48 83 c4 20 add $0x20,%rsp > > e6: 5d pop %rbp > > e7: c3 retq > > > > ================================================================> > > > > > Finally, my question: > > > > What is the correct LLVM IR to represent i128 values so that it will be > > compatible with the compiler-rt calls that LLVM generates? For example, > what > > should be the LLVM IR definition of __udivti3? > > > > Because even though clang/compiler-rt project generates `define i128 > > @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on > > windows. > > > > Thanks, > > Andrew > > > > _______________________________________________ > > LLVM Developers mailing list > > llvm-dev at lists.llvm.org > > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev > > > > > > -- > With best regards, Anton Korobeynikov > Department of Statistical Modelling, Saint Petersburg State University >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180426/b98506cf/attachment.html>