Andrew Kelley via llvm-dev
2018-Apr-26 03:44 UTC
[llvm-dev] windows ABI problem with i128?
I'm trying to use LLVM to create compiler-rt.o on Windows. I use this command from the compiler-rt project: [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows -DCRT_HAS_128BIT The resulting LLVM IR is: ================================================================ ; ModuleID = 'lib/builtins/udivti3.c' source_filename = "lib/builtins/udivti3.c" target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--windows-msvc19.11.0" ; Function Attrs: noinline nounwind optnone uwtable define i128 @__udivti3(i128, i128) #0 { %3 = alloca i128, align 16 %4 = alloca i128, align 16 store i128 %1, i128* %3, align 16 store i128 %0, i128* %4, align 16 %5 = load i128, i128* %3, align 16 %6 = load i128, i128* %4, align 16 %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null) ret i128 %7 } declare i128 @__udivmodti4(i128, i128, i128*) #1 attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} !0 = !{i32 1, !"wchar_size", i32 2} !1 = !{i32 7, !"PIC Level", i32 2} !2 = !{!"clang version 6.0.0 
(tags/RELEASE_600/final)"} ================================================================However I think this results in a different ABI than LLVM will use when you do i128 division. For example, here is my test case (in zig code): ================================================================ pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn; export fn WinMainCRTStartup() noreturn { @setAlignStack(16); @setRuntimeSafety(false); var a: u128 = 152313999999999991610955792383; var b: u128 = 10000000000000000000; var c = a / b; // this generates a call to __udivti3 if (c != b) { @breakpoint(); } ExitProcess(0); } export fn __udivti3(a: u128, b: u128) u128 { @setRuntimeSafety(false); return b; } ================================================================This results in this LLVM IR: ================================================================ ; ModuleID = 'test' source_filename = "test" target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" %"[]u8" = type { i8*, i64 } %StackTrace = type { i64, %"[]usize" } %"[]usize" = type { i64*, i64 } ; Function Attrs: nounwind readnone speculatable declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable alignstack(16) define void @WinMainCRTStartup() #2 !dbg !41 { Entry: %a = alloca i128, align 8 %b = alloca i128, align 8 %c = alloca i128, align 8 store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52 call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata !DIExpression()), !dbg !52 store i128 10000000000000000000, i128* %b, align 8, !dbg !53 call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata !DIExpression()), !dbg !53 %0 = load i128, i128* %a, align 8, !dbg !54 %1 = load i128, i128* %b, align 8, !dbg !55 %2 = udiv i128 %0, %1, !dbg !56 store i128 %2, i128* %c, align 8, !dbg !57 call void @llvm.dbg.declare(metadata i128* %c, 
metadata !50, metadata !DIExpression()), !dbg !57 %3 = load i128, i128* %c, align 8, !dbg !58 %4 = load i128, i128* %b, align 8, !dbg !60 %5 = icmp ne i128 %3, %4, !dbg !61 br i1 %5, label %Then, label %Else, !dbg !61 Then: ; preds = %Entry call void @llvm.debugtrap(), !dbg !62 br label %EndIf, !dbg !64 Else: ; preds = %Entry br label %EndIf, !dbg !64 EndIf: ; preds = %Else, %Then call void @ExitProcess(i32 0), !dbg !65 unreachable, !dbg !65 } ; Function Attrs: nounwind declare void @llvm.debugtrap() #3 ; Function Attrs: nobuiltin noreturn nounwind uwtable declare void @ExitProcess(i32) #0 ; Function Attrs: nobuiltin nounwind uwtable define i128 @__udivti3(i128, i128) #4 !dbg !66 { Entry: %a = alloca i128, align 8 %b = alloca i128, align 8 store i128 %0, i128* %a, align 8 call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata !DIExpression()), !dbg !73 store i128 %1, i128* %b, align 8 call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata !DIExpression()), !dbg !74 %2 = load i128, i128* %b, align 8, !dbg !75 ret i128 %2, !dbg !78 } ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #3 attributes #0 = { nobuiltin noreturn nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } attributes #1 = { nounwind readnone speculatable } attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16 "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } attributes #3 = { nounwind } attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} ================================================================ When I link this with (link.exe or LLD, it does not matter): link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console kernel32.lib /nologo And run it, it triggers the breakpoint. Meanwhile on linux, this test passes. 
I suspect it may be a calling convention issue. Here is the assembly for the linux x86_64 version: ================================================================0000000000000010 <_start>: 10: 55 push %rbp 11: 48 89 e5 mov %rsp,%rbp 14: 48 83 ec 40 sub $0x40,%rsp 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax 1f: 00 00 00 22: 48 89 45 f8 mov %rax,-0x8(%rbp) 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax 2d: 77 73 ff 30: 48 89 45 f0 mov %rax,-0x10(%rbp) 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax 3b: 23 c7 8a 3e: 48 89 45 e0 mov %rax,-0x20(%rbp) 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) 49: 00 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx 5a: e8 00 00 00 00 callq 5f <_start+0x4f> 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp) 63: 48 89 45 d0 mov %rax,-0x30(%rbp) 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d 80: 44 89 45 cc mov %r8d,-0x34(%rbp) 84: 74 06 je 8c <_start+0x7c> 86: eb 00 jmp 88 <_start+0x78> 88: eb 00 jmp 8a <_start+0x7a> 8a: eb fe jmp 8a <_start+0x7a> 8c: eb 00 jmp 8e <_start+0x7e> 8e: 48 83 c4 40 add $0x40,%rsp 92: 5d pop %rbp 93: c3 retq 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) 9b: 00 00 00 00 00 00000000000000a0 <__udivti3>: a0: 55 push %rbp a1: 48 89 e5 mov %rsp,%rbp a4: 48 89 7d f0 mov %rdi,-0x10(%rbp) a8: 48 89 75 f8 mov %rsi,-0x8(%rbp) ac: 48 89 4d e8 mov %rcx,-0x18(%rbp) b0: 48 89 55 e0 mov %rdx,-0x20(%rbp) b4: 48 8b 45 e0 mov -0x20(%rbp),%rax b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx bc: 5d pop %rbp bd: c3 retq ================================================================ And here is the assembly for the windows x86_64 version: ================================================================0000000000000010 <_start>: 10: 55 push %rbp 11: 
48 81 ec 80 00 00 00 sub $0x80,%rsp 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp 1f: 00 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax 27: 00 00 00 2a: 48 89 45 f8 mov %rax,-0x8(%rbp) 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax 35: 77 73 ff 38: 48 89 45 f0 mov %rax,-0x10(%rbp) 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax 43: 23 c7 8a 46: 48 89 45 e0 mov %rax,-0x20(%rbp) 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) 51: 00 52: 48 8b 45 f0 mov -0x10(%rbp),%rax 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8 62: 48 89 4d c8 mov %rcx,-0x38(%rbp) 66: 48 89 45 c0 mov %rax,-0x40(%rbp) 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp) 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp) 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx 7a: e8 41 00 00 00 callq c0 <__udivti3> 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp) 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp) 8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d a6: 44 89 4d ac mov %r9d,-0x54(%rbp) aa: 74 06 je b2 <_start+0xa2> ac: eb 00 jmp ae <_start+0x9e> ae: eb 00 jmp b0 <_start+0xa0> b0: eb fe jmp b0 <_start+0xa0> b2: eb 00 jmp b4 <_start+0xa4> b4: 48 81 c4 80 00 00 00 add $0x80,%rsp bb: 5d pop %rbp bc: c3 retq bd: 90 nop be: 90 nop bf: 90 nop 00000000000000c0 <__udivti3>: c0: 55 push %rbp c1: 48 83 ec 20 sub $0x20,%rsp c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp ca: 48 89 4d f0 mov %rcx,-0x10(%rbp) ce: 48 89 55 f8 mov %rdx,-0x8(%rbp) d2: 4c 89 4d e8 mov %r9,-0x18(%rbp) d6: 4c 89 45 e0 mov %r8,-0x20(%rbp) da: 48 8b 45 e0 mov -0x20(%rbp),%rax de: 48 8b 55 e8 mov -0x18(%rbp),%rdx e2: 48 83 c4 20 add $0x20,%rsp e6: 5d pop %rbp e7: c3 retq ================================================================ Finally, my question: What is the correct LLVM IR to 
represent i128 values so that they will be compatible with the compiler-rt calls that LLVM generates? For example, what should be the LLVM IR definition of __udivti3? I ask because even though the clang/compiler-rt project generates `define i128 @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on Windows. Thanks, Andrew -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180425/e8b7cbf8/attachment.html>
Anton Korobeynikov via llvm-dev
2018-Apr-26 07:44 UTC
[llvm-dev] windows ABI problem with i128?
Most probably you need to properly specify the calling convention the backend is using for calling the runtime functions. Or implement the stub for udivti3 that performs the necessary argument lifting. I guess there is no standard ABI document describing the intended calling convention here, so I'd just do what mingw64 does here and make everything here compatible. On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev <llvm-dev at lists.llvm.org> wrote:> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this > command from the compiler-rt project: > > [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S > -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows > -DCRT_HAS_128BIT > > The resulting LLVM IR is: > ================================================================> > ; ModuleID = 'lib/builtins/udivti3.c' > source_filename = "lib/builtins/udivti3.c" > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64--windows-msvc19.11.0" > > ; Function Attrs: noinline nounwind optnone uwtable > define i128 @__udivti3(i128, i128) #0 { > %3 = alloca i128, align 16 > %4 = alloca i128, align 16 > store i128 %1, i128* %3, align 16 > store i128 %0, i128* %4, align 16 > %5 = load i128, i128* %3, align 16 > %6 = load i128, i128* %4, align 16 > %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null) > ret i128 %7 > } > > declare i128 @__udivmodti4(i128, i128, i128*) #1 > > attributes #0 = { noinline nounwind optnone uwtable > "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" > "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" > "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" > "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" > "stack-protector-buffer-size"="8" "target-cpu"="x86-64" > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" > "use-soft-float"="false" } > attributes #1 = { 
"correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" "less-precise-fpmad"="false" > "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > "unsafe-fp-math"="false" "use-soft-float"="false" } > > !llvm.module.flags = !{!0, !1} > !llvm.ident = !{!2} > > !0 = !{i32 1, !"wchar_size", i32 2} > !1 = !{i32 7, !"PIC Level", i32 2} > !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"} > > > ================================================================> However I think this results in a different ABI than LLVM will use when you > do i128 division. For example, here is my test case (in zig code): > ================================================================> > pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn; > > export fn WinMainCRTStartup() noreturn { > @setAlignStack(16); > @setRuntimeSafety(false); > > var a: u128 = 152313999999999991610955792383; > var b: u128 = 10000000000000000000; > var c = a / b; // this generates a call to __udivti3 > > if (c != b) { > @breakpoint(); > } > ExitProcess(0); > } > > export fn __udivti3(a: u128, b: u128) u128 { > @setRuntimeSafety(false); > return b; > } > > > ================================================================> This results in this LLVM IR: > ================================================================> > ; ModuleID = 'test' > source_filename = "test" > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > target triple = "x86_64-pc-windows-msvc" > > %"[]u8" = type { i8*, i64 } > %StackTrace = type { i64, %"[]usize" } > %"[]usize" = type { i64*, i64 } > > ; Function Attrs: nounwind readnone speculatable > declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 > > ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable > alignstack(16) 
> define void @WinMainCRTStartup() #2 !dbg !41 { > Entry: > %a = alloca i128, align 8 > %b = alloca i128, align 8 > %c = alloca i128, align 8 > store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52 > call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata > !DIExpression()), !dbg !52 > store i128 10000000000000000000, i128* %b, align 8, !dbg !53 > call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata > !DIExpression()), !dbg !53 > %0 = load i128, i128* %a, align 8, !dbg !54 > %1 = load i128, i128* %b, align 8, !dbg !55 > %2 = udiv i128 %0, %1, !dbg !56 > store i128 %2, i128* %c, align 8, !dbg !57 > call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata > !DIExpression()), !dbg !57 > %3 = load i128, i128* %c, align 8, !dbg !58 > %4 = load i128, i128* %b, align 8, !dbg !60 > %5 = icmp ne i128 %3, %4, !dbg !61 > br i1 %5, label %Then, label %Else, !dbg !61 > > Then: ; preds = %Entry > call void @llvm.debugtrap(), !dbg !62 > br label %EndIf, !dbg !64 > > Else: ; preds = %Entry > br label %EndIf, !dbg !64 > > EndIf: ; preds = %Else, %Then > call void @ExitProcess(i32 0), !dbg !65 > unreachable, !dbg !65 > } > > ; Function Attrs: nounwind > declare void @llvm.debugtrap() #3 > > ; Function Attrs: nobuiltin noreturn nounwind uwtable > declare void @ExitProcess(i32) #0 > > ; Function Attrs: nobuiltin nounwind uwtable > define i128 @__udivti3(i128, i128) #4 !dbg !66 { > Entry: > %a = alloca i128, align 8 > %b = alloca i128, align 8 > store i128 %0, i128* %a, align 8 > call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata > !DIExpression()), !dbg !73 > store i128 %1, i128* %b, align 8 > call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata > !DIExpression()), !dbg !74 > %2 = load i128, i128* %b, align 8, !dbg !75 > ret i128 %2, !dbg !78 > } > > ; Function Attrs: nounwind > declare void @llvm.stackprotector(i8*, i8**) #3 > > attributes #0 = { nobuiltin noreturn nounwind uwtable > 
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > attributes #1 = { nounwind readnone speculatable } > attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16 > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > attributes #3 = { nounwind } > attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true" > "no-frame-pointer-elim-non-leaf" } > > !llvm.module.flags = !{!0} > !llvm.dbg.cu = !{!1} > > ================================================================> > When I link this with (link.exe or LLD, it does not matter): > link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console > kernel32.lib /nologo > > And run it, it triggers the breakpoint. > > Meanwhile on linux, this test passes. > > I suspect it may be a calling convention issue. Here is the assembly for the > linux x86_64 version: > > > ================================================================> 0000000000000010 <_start>: > 10: 55 push %rbp > 11: 48 89 e5 mov %rsp,%rbp > 14: 48 83 ec 40 sub $0x40,%rsp > 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > 1f: 00 00 00 > 22: 48 89 45 f8 mov %rax,-0x8(%rbp) > 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > 2d: 77 73 ff > 30: 48 89 45 f0 mov %rax,-0x10(%rbp) > 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > 3b: 23 c7 8a > 3e: 48 89 45 e0 mov %rax,-0x20(%rbp) > 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > 49: 00 > 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi > 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi > 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx > 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx > 5a: e8 00 00 00 00 callq 5f <_start+0x4f> > 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp) > 63: 48 89 45 d0 mov %rax,-0x30(%rbp) > 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0 > 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1 > 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0 > 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d > 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d > 80: 44 89 45 cc mov %r8d,-0x34(%rbp) > 
84: 74 06 je 8c <_start+0x7c> > 86: eb 00 jmp 88 <_start+0x78> > 88: eb 00 jmp 8a <_start+0x7a> > 8a: eb fe jmp 8a <_start+0x7a> > 8c: eb 00 jmp 8e <_start+0x7e> > 8e: 48 83 c4 40 add $0x40,%rsp > 92: 5d pop %rbp > 93: c3 retq > 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) > 9b: 00 00 00 00 00 > > 00000000000000a0 <__udivti3>: > a0: 55 push %rbp > a1: 48 89 e5 mov %rsp,%rbp > a4: 48 89 7d f0 mov %rdi,-0x10(%rbp) > a8: 48 89 75 f8 mov %rsi,-0x8(%rbp) > ac: 48 89 4d e8 mov %rcx,-0x18(%rbp) > b0: 48 89 55 e0 mov %rdx,-0x20(%rbp) > b4: 48 8b 45 e0 mov -0x20(%rbp),%rax > b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx > bc: 5d pop %rbp > bd: c3 retq > > > ================================================================> > And here is the assembly for the windows x86_64 version: > > > ================================================================> 0000000000000010 <_start>: > 10: 55 push %rbp > 11: 48 81 ec 80 00 00 00 sub $0x80,%rsp > 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp > 1f: 00 > 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > 27: 00 00 00 > 2a: 48 89 45 f8 mov %rax,-0x8(%rbp) > 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > 35: 77 73 ff > 38: 48 89 45 f0 mov %rax,-0x10(%rbp) > 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > 43: 23 c7 8a > 46: 48 89 45 e0 mov %rax,-0x20(%rbp) > 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > 51: 00 > 52: 48 8b 45 f0 mov -0x10(%rbp),%rax > 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx > 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx > 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8 > 62: 48 89 4d c8 mov %rcx,-0x38(%rbp) > 66: 48 89 45 c0 mov %rax,-0x40(%rbp) > 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp) > 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp) > 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx > 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx > 7a: e8 41 00 00 00 callq c0 <__udivti3> > 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1 > 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp) > 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp) > 8e: 0f 10 45 d0 movups 
-0x30(%rbp),%xmm0 > 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1 > 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0 > 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d > 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d > a6: 44 89 4d ac mov %r9d,-0x54(%rbp) > aa: 74 06 je b2 <_start+0xa2> > ac: eb 00 jmp ae <_start+0x9e> > ae: eb 00 jmp b0 <_start+0xa0> > b0: eb fe jmp b0 <_start+0xa0> > b2: eb 00 jmp b4 <_start+0xa4> > b4: 48 81 c4 80 00 00 00 add $0x80,%rsp > bb: 5d pop %rbp > bc: c3 retq > bd: 90 nop > be: 90 nop > bf: 90 nop > > 00000000000000c0 <__udivti3>: > c0: 55 push %rbp > c1: 48 83 ec 20 sub $0x20,%rsp > c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp > ca: 48 89 4d f0 mov %rcx,-0x10(%rbp) > ce: 48 89 55 f8 mov %rdx,-0x8(%rbp) > d2: 4c 89 4d e8 mov %r9,-0x18(%rbp) > d6: 4c 89 45 e0 mov %r8,-0x20(%rbp) > da: 48 8b 45 e0 mov -0x20(%rbp),%rax > de: 48 8b 55 e8 mov -0x18(%rbp),%rdx > e2: 48 83 c4 20 add $0x20,%rsp > e6: 5d pop %rbp > e7: c3 retq > > ================================================================> > > Finally, my question: > > What is the correct LLVM IR to represent i128 values so that it will be > compatible with the compiler-rt calls that LLVM generates? For example, what > should be the LLVM IR definition of __udivti3? > > Because even though clang/compiler-rt project generates `define i128 > @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on > windows. > > Thanks, > Andrew > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-- With best regards, Anton Korobeynikov Department of Statistical Modelling, Saint Petersburg State University
Andrew Kelley via llvm-dev
2018-Apr-26 15:30 UTC
[llvm-dev] windows ABI problem with i128?
On Thu, Apr 26, 2018 at 3:44 AM, Anton Korobeynikov <anton at korobeynikov.info> wrote: > Most probably you need to properly specify the calling convention the > backend is using for calling the runtime functions. Thanks for the tip. Can you be more specific? Are you suggesting there is some config parameter I can set before running TargetMachineEmitToFile? Do you know what calling convention it is trying to use at the callsite? Perhaps I can simply select a different convention from this list for the implementation of udivti3? http://llvm.org/docs/LangRef.html#calling-conventions > Or implement the > stub for udivti3 that performs the necessary argument lifting. > > I guess there is no standard ABI document describing the intended > calling convention here, so I'd just do what mingw64 does here and > make everything here compatible. > > On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev > <llvm-dev at lists.llvm.org> wrote: > > I'm trying to use LLVM to create compiler-rt.o on Windows. 
I use this > > command from the compiler-rt project: > > > > [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S > > -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows > > -DCRT_HAS_128BIT > > > > The resulting LLVM IR is: > > ================================================================> > > > ; ModuleID = 'lib/builtins/udivti3.c' > > source_filename = "lib/builtins/udivti3.c" > > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > > target triple = "x86_64--windows-msvc19.11.0" > > > > ; Function Attrs: noinline nounwind optnone uwtable > > define i128 @__udivti3(i128, i128) #0 { > > %3 = alloca i128, align 16 > > %4 = alloca i128, align 16 > > store i128 %1, i128* %3, align 16 > > store i128 %0, i128* %4, align 16 > > %5 = load i128, i128* %3, align 16 > > %6 = load i128, i128* %4, align 16 > > %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null) > > ret i128 %7 > > } > > > > declare i128 @__udivmodti4(i128, i128, i128*) #1 > > > > attributes #0 = { noinline nounwind optnone uwtable > > "correctly-rounded-divide-sqrt-fp-math"="false" > "disable-tail-calls"="false" > > "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" > > "no-infs-fp-math"="false" "no-jump-tables"="false" > "no-nans-fp-math"="false" > > "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" > > "stack-protector-buffer-size"="8" "target-cpu"="x86-64" > > "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" > > "use-soft-float"="false" } > > attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" > > "disable-tail-calls"="false" "less-precise-fpmad"="false" > > "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" > > "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" > > "no-trapping-math"="false" "stack-protector-buffer-size"="8" > > "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" > > "unsafe-fp-math"="false" "use-soft-float"="false" } > > > > !llvm.module.flags = !{!0, !1} 
> > !llvm.ident = !{!2} > > > > !0 = !{i32 1, !"wchar_size", i32 2} > > !1 = !{i32 7, !"PIC Level", i32 2} > > !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"} > > > > > > ================================================================> > However I think this results in a different ABI than LLVM will use when > you > > do i128 division. For example, here is my test case (in zig code): > > ================================================================> > > > pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) > noreturn; > > > > export fn WinMainCRTStartup() noreturn { > > @setAlignStack(16); > > @setRuntimeSafety(false); > > > > var a: u128 = 152313999999999991610955792383; > > var b: u128 = 10000000000000000000; > > var c = a / b; // this generates a call to __udivti3 > > > > if (c != b) { > > @breakpoint(); > > } > > ExitProcess(0); > > } > > > > export fn __udivti3(a: u128, b: u128) u128 { > > @setRuntimeSafety(false); > > return b; > > } > > > > > > ================================================================> > This results in this LLVM IR: > > ================================================================> > > > ; ModuleID = 'test' > > source_filename = "test" > > target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" > > target triple = "x86_64-pc-windows-msvc" > > > > %"[]u8" = type { i8*, i64 } > > %StackTrace = type { i64, %"[]usize" } > > %"[]usize" = type { i64*, i64 } > > > > ; Function Attrs: nounwind readnone speculatable > > declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 > > > > ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable > > alignstack(16) > > define void @WinMainCRTStartup() #2 !dbg !41 { > > Entry: > > %a = alloca i128, align 8 > > %b = alloca i128, align 8 > > %c = alloca i128, align 8 > > store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52 > > call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata > > !DIExpression()), !dbg !52 > > store 
i128 10000000000000000000, i128* %b, align 8, !dbg !53 > > call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata > > !DIExpression()), !dbg !53 > > %0 = load i128, i128* %a, align 8, !dbg !54 > > %1 = load i128, i128* %b, align 8, !dbg !55 > > %2 = udiv i128 %0, %1, !dbg !56 > > store i128 %2, i128* %c, align 8, !dbg !57 > > call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata > > !DIExpression()), !dbg !57 > > %3 = load i128, i128* %c, align 8, !dbg !58 > > %4 = load i128, i128* %b, align 8, !dbg !60 > > %5 = icmp ne i128 %3, %4, !dbg !61 > > br i1 %5, label %Then, label %Else, !dbg !61 > > > > Then: ; preds = %Entry > > call void @llvm.debugtrap(), !dbg !62 > > br label %EndIf, !dbg !64 > > > > Else: ; preds = %Entry > > br label %EndIf, !dbg !64 > > > > EndIf: ; preds = %Else, %Then > > call void @ExitProcess(i32 0), !dbg !65 > > unreachable, !dbg !65 > > } > > > > ; Function Attrs: nounwind > > declare void @llvm.debugtrap() #3 > > > > ; Function Attrs: nobuiltin noreturn nounwind uwtable > > declare void @ExitProcess(i32) #0 > > > > ; Function Attrs: nobuiltin nounwind uwtable > > define i128 @__udivti3(i128, i128) #4 !dbg !66 { > > Entry: > > %a = alloca i128, align 8 > > %b = alloca i128, align 8 > > store i128 %0, i128* %a, align 8 > > call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata > > !DIExpression()), !dbg !73 > > store i128 %1, i128* %b, align 8 > > call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata > > !DIExpression()), !dbg !74 > > %2 = load i128, i128* %b, align 8, !dbg !75 > > ret i128 %2, !dbg !78 > > } > > > > ; Function Attrs: nounwind > > declare void @llvm.stackprotector(i8*, i8**) #3 > > > > attributes #0 = { nobuiltin noreturn nounwind uwtable > > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > > attributes #1 = { nounwind readnone speculatable } > > attributes #2 = { nobuiltin noinline noreturn nounwind uwtable > alignstack=16 > > 
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } > > attributes #3 = { nounwind } > > attributes #4 = { nobuiltin nounwind uwtable > "no-frame-pointer-elim"="true" > > "no-frame-pointer-elim-non-leaf" } > > > > !llvm.module.flags = !{!0} > > !llvm.dbg.cu = !{!1} > > > > ================================================================> > > > When I link this with (link.exe or LLD, it does not matter): > > link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj > /subsystem:console > > kernel32.lib /nologo > > > > And run it, it triggers the breakpoint. > > > > Meanwhile on linux, this test passes. > > > > I suspect it may be a calling convention issue. Here is the assembly for > the > > linux x86_64 version: > > > > > > ================================================================> > 0000000000000010 <_start>: > > 10: 55 push %rbp > > 11: 48 89 e5 mov %rsp,%rbp > > 14: 48 83 ec 40 sub $0x40,%rsp > > 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > > 1f: 00 00 00 > > 22: 48 89 45 f8 mov %rax,-0x8(%rbp) > > 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > > 2d: 77 73 ff > > 30: 48 89 45 f0 mov %rax,-0x10(%rbp) > > 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > > 3b: 23 c7 8a > > 3e: 48 89 45 e0 mov %rax,-0x20(%rbp) > > 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > > 49: 00 > > 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi > > 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi > > 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx > > 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx > > 5a: e8 00 00 00 00 callq 5f <_start+0x4f> > > 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp) > > 63: 48 89 45 d0 mov %rax,-0x30(%rbp) > > 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0 > > 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1 > > 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0 > > 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d > > 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d > > 80: 44 89 45 cc mov %r8d,-0x34(%rbp) > > 84: 74 06 je 8c <_start+0x7c> > > 86: eb 00 jmp 88 <_start+0x78> > > 88: eb 00 jmp 8a 
<_start+0x7a> > > 8a: eb fe jmp 8a <_start+0x7a> > > 8c: eb 00 jmp 8e <_start+0x7e> > > 8e: 48 83 c4 40 add $0x40,%rsp > > 92: 5d pop %rbp > > 93: c3 retq > > 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) > > 9b: 00 00 00 00 00 > > > > 00000000000000a0 <__udivti3>: > > a0: 55 push %rbp > > a1: 48 89 e5 mov %rsp,%rbp > > a4: 48 89 7d f0 mov %rdi,-0x10(%rbp) > > a8: 48 89 75 f8 mov %rsi,-0x8(%rbp) > > ac: 48 89 4d e8 mov %rcx,-0x18(%rbp) > > b0: 48 89 55 e0 mov %rdx,-0x20(%rbp) > > b4: 48 8b 45 e0 mov -0x20(%rbp),%rax > > b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx > > bc: 5d pop %rbp > > bd: c3 retq > > > > > > ================================================================> > > > And here is the assembly for the windows x86_64 version: > > > > > > ================================================================> > 0000000000000010 <_start>: > > 10: 55 push %rbp > > 11: 48 81 ec 80 00 00 00 sub $0x80,%rsp > > 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp > > 1f: 00 > > 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax > > 27: 00 00 00 > > 2a: 48 89 45 f8 mov %rax,-0x8(%rbp) > > 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax > > 35: 77 73 ff > > 38: 48 89 45 f0 mov %rax,-0x10(%rbp) > > 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax > > 43: 23 c7 8a > > 46: 48 89 45 e0 mov %rax,-0x20(%rbp) > > 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp) > > 51: 00 > > 52: 48 8b 45 f0 mov -0x10(%rbp),%rax > > 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx > > 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx > > 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8 > > 62: 48 89 4d c8 mov %rcx,-0x38(%rbp) > > 66: 48 89 45 c0 mov %rax,-0x40(%rbp) > > 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp) > > 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp) > > 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx > > 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx > > 7a: e8 41 00 00 00 callq c0 <__udivti3> > > 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1 > > 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp) > > 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp) > > 
8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0 > > 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1 > > 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0 > > 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d > > 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d > > a6: 44 89 4d ac mov %r9d,-0x54(%rbp) > > aa: 74 06 je b2 <_start+0xa2> > > ac: eb 00 jmp ae <_start+0x9e> > > ae: eb 00 jmp b0 <_start+0xa0> > > b0: eb fe jmp b0 <_start+0xa0> > > b2: eb 00 jmp b4 <_start+0xa4> > > b4: 48 81 c4 80 00 00 00 add $0x80,%rsp > > bb: 5d pop %rbp > > bc: c3 retq > > bd: 90 nop > > be: 90 nop > > bf: 90 nop > > > > 00000000000000c0 <__udivti3>: > > c0: 55 push %rbp > > c1: 48 83 ec 20 sub $0x20,%rsp > > c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp > > ca: 48 89 4d f0 mov %rcx,-0x10(%rbp) > > ce: 48 89 55 f8 mov %rdx,-0x8(%rbp) > > d2: 4c 89 4d e8 mov %r9,-0x18(%rbp) > > d6: 4c 89 45 e0 mov %r8,-0x20(%rbp) > > da: 48 8b 45 e0 mov -0x20(%rbp),%rax > > de: 48 8b 55 e8 mov -0x18(%rbp),%rdx > > e2: 48 83 c4 20 add $0x20,%rsp > > e6: 5d pop %rbp > > e7: c3 retq > > > > ================================================================> > > > > > Finally, my question: > > > > What is the correct LLVM IR to represent i128 values so that it will be > > compatible with the compiler-rt calls that LLVM generates? For example, > what > > should be the LLVM IR definition of __udivti3? > > > > Because even though clang/compiler-rt project generates `define i128 > > @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on > > windows. > > > > Thanks, > > Andrew > > > > _______________________________________________ > > LLVM Developers mailing list > > llvm-dev at lists.llvm.org > > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev > > > > > > -- > With best regards, Anton Korobeynikov > Department of Statistical Modelling, Saint Petersburg State University >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180426/b98506cf/attachment.html>
Possibly Parallel Threads
- windows ABI problem with i128?
- windows ABI problem with i128?
- error: couldn't allocate input reg for constraint '{xmm0}'
- [LLVMdev] Fail to load a pointer to a function inside MCJIT-ed code when it is reload from ObjectCache
- [cfe-dev] How to debug if LTO generate wrong code?