I don't know how to file a PR, but I have a patch (see below) that should
work regardless of ABI differences, since it relies on the compiler to do
the tough job.

void X86CompilationCallback_SSE(void) {
   char * SAVEBUF = (char*) alloca(64+12); // alloca is 16-byte aligned

   asm volatile (
     "movl %%eax,(%0)\n"
     "movl %%edx,4(%0)\n"    // Save EAX/EDX/ECX
     "movl %%ecx,8(%0)\n"
     :: "r"(SAVEBUF+64) : "memory" );

   asm volatile (
     // Save all XMM arg registers
     "movaps %%xmm0, (%0)\n"
     "movaps %%xmm1, 16(%0)\n"
     "movaps %%xmm2, 32(%0)\n"
     "movaps %%xmm3, 48(%0)\n"
     :: "r"(SAVEBUF) : "memory" );

   intptr_t *StackPtr = 0, RetAddr = 0;

   asm volatile ( // get stack ptr and retaddr
     "movl %%ebp,%0\n"
     "movl 4(%%ebp),%1\n"
     : "=r"(StackPtr), "=r"(RetAddr) :: "memory" );

   X86CompilationCallback2(StackPtr, RetAddr); // gcc knows how to call this according to the ABI

   asm volatile ( // restore XMM arg registers
     "movaps 48(%0), %%xmm3\n"
     "movaps 32(%0), %%xmm2\n"
     "movaps 16(%0), %%xmm1\n"
     "movaps (%0), %%xmm0\n"
     :: "r"(SAVEBUF) : "memory" );

   asm volatile (
     "movl (%0),%%eax\n"
     "movl 4(%0),%%edx\n"    // Restore EAX/EDX/ECX
     "movl 8(%0),%%ecx\n"
     :: "r"(SAVEBUF+64) : "memory" );
}

The generated code is as follows:

Dump of assembler code for function X86CompilationCallback_SSE:
0xb74b98e0 <X86CompilationCallback_SSE+0>:    push   %ebp
0xb74b98e1 <X86CompilationCallback_SSE+1>:    mov    %esp,%ebp
0xb74b98e3 <X86CompilationCallback_SSE+3>:    sub    $0x78,%esp
0xb74b98e6 <X86CompilationCallback_SSE+6>:    mov    %esi,-0x8(%ebp)
0xb74b98e9 <X86CompilationCallback_SSE+9>:    lea    0x17(%esp),%esi
0xb74b98ed <X86CompilationCallback_SSE+13>:   and    $0xfffffff0,%esi
0xb74b98f0 <X86CompilationCallback_SSE+16>:   mov    %ebx,-0xc(%ebp)
0xb74b98f3 <X86CompilationCallback_SSE+19>:   mov    %edi,-0x4(%ebp)
0xb74b98f6 <X86CompilationCallback_SSE+22>:   lea    0x40(%esi),%edi
0xb74b98f9 <X86CompilationCallback_SSE+25>:   call   0xb7315577 <__i686.get_pc_thunk.bx>
0xb74b98fe <X86CompilationCallback_SSE+30>:   add    $0x76d71e,%ebx
0xb74b9904 <X86CompilationCallback_SSE+36>:   mov    %eax,(%edi)
0xb74b9906 <X86CompilationCallback_SSE+38>:   mov    %edx,0x4(%edi)
0xb74b9909 <X86CompilationCallback_SSE+41>:   mov    %ecx,0x8(%edi)
0xb74b990c <X86CompilationCallback_SSE+44>:   movaps %xmm0,(%esi)
0xb74b990f <X86CompilationCallback_SSE+47>:   movaps %xmm1,0x10(%esi)
0xb74b9913 <X86CompilationCallback_SSE+51>:   movaps %xmm2,0x20(%esi)
0xb74b9917 <X86CompilationCallback_SSE+55>:   movaps %xmm3,0x30(%esi)
0xb74b991b <X86CompilationCallback_SSE+59>:   mov    %ebp,%edx
0xb74b991d <X86CompilationCallback_SSE+61>:   mov    0x4(%ebp),%eax
0xb74b9920 <X86CompilationCallback_SSE+64>:   mov    %eax,0x4(%esp)
0xb74b9924 <X86CompilationCallback_SSE+68>:   mov    %edx,(%esp)
0xb74b9927 <X86CompilationCallback_SSE+71>:   call   0xb7303348 <X86CompilationCallback2@plt>
0xb74b992c <X86CompilationCallback_SSE+76>:   movaps 0x30(%esi),%xmm3
0xb74b9930 <X86CompilationCallback_SSE+80>:   movaps 0x20(%esi),%xmm2
0xb74b9934 <X86CompilationCallback_SSE+84>:   movaps 0x10(%esi),%xmm1
0xb74b9938 <X86CompilationCallback_SSE+88>:   movaps (%esi),%xmm0
0xb74b993b <X86CompilationCallback_SSE+91>:   mov    (%edi),%eax
0xb74b993d <X86CompilationCallback_SSE+93>:   mov    0x4(%edi),%edx
0xb74b9940 <X86CompilationCallback_SSE+96>:   mov    0x8(%edi),%ecx
0xb74b9943 <X86CompilationCallback_SSE+99>:   mov    -0xc(%ebp),%ebx
0xb74b9946 <X86CompilationCallback_SSE+102>:  mov    -0x8(%ebp),%esi
0xb74b9949 <X86CompilationCallback_SSE+105>:  mov    -0x4(%ebp),%edi
0xb74b994c <X86CompilationCallback_SSE+108>:  mov    %ebp,%esp
0xb74b994e <X86CompilationCallback_SSE+110>:  pop    %ebp
0xb74b994f <X86CompilationCallback_SSE+111>:  ret
End of assembler dump.

And I verified that it works in my use case.
Clearly the same should be done for the other asm functions in that same
file (e.g. the non-SSE case).

Corrado
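[Editor's note: Corrado suggests the non-SSE callback should get the same
treatment. A minimal sketch of what that variant might look like, derived
only from the patch above by dropping the XMM block; the function name and
the extern declaration are illustrative assumptions, not part of the
posted patch.]

#include <stdint.h>

extern void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);

/* Hypothetical non-SSE variant: same structure as the SSE callback, but
 * without the XMM save/restore, so only EAX/EDX/ECX need a save area. */
void X86CompilationCallback(void) {
  char SAVEBUF[12];                       /* room for EAX/EDX/ECX only */

  asm volatile (
    "movl %%eax,(%0)\n"
    "movl %%edx,4(%0)\n"                  /* Save EAX/EDX/ECX */
    "movl %%ecx,8(%0)\n"
    :: "r"(SAVEBUF) : "memory" );

  intptr_t *StackPtr = 0, RetAddr = 0;

  asm volatile (                          /* get stack ptr and retaddr */
    "movl %%ebp,%0\n"
    "movl 4(%%ebp),%1\n"
    : "=r"(StackPtr), "=r"(RetAddr) :: "memory" );

  X86CompilationCallback2(StackPtr, RetAddr);

  asm volatile (
    "movl (%0),%%eax\n"
    "movl 4(%0),%%edx\n"                  /* Restore EAX/EDX/ECX */
    "movl 8(%0),%%ecx\n"
    :: "r"(SAVEBUF) : "memory" );
}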
On Mar 11, 2009, at 2:39 PM, Corrado Zoccolo wrote:

> void X86CompilationCallback_SSE(void) {
>    char * SAVEBUF = (char*) alloca(64+12); // alloca is 16-byte aligned

How do you ensure it's 16-byte aligned? Can you declare a local array
and specify alignment using attribute __aligned?

Evan
On Thu, Mar 12, 2009 at 6:46 AM, Evan Cheng <echeng at apple.com> wrote:

> How do you ensure it's 16-byte aligned? Can you declare a local array
> and specify alignment using attribute __aligned?

Well, my gcc ensures it is properly aligned (following the malloc
specification), but alloca is not fully specified, so it is probably
better to use the local array with the attribute, to be sure this will
also work on different versions of gcc.

--
__________________________________________________________________________
dott. Corrado Zoccolo                          mailto:czoccolo at gmail.com
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
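[Editor's note: a minimal sketch of the buffer declaration Evan suggests,
using GCC's aligned attribute in place of alloca. It reuses the names from
the patch for clarity, but only the first asm block is reproduced; this is
an illustration of the alignment fix, not a tested replacement.]

/* A local array with __attribute__((aligned(16))) gives a guaranteed
 * 16-byte-aligned save area, unlike alloca, whose alignment is
 * implementation-defined.  64 bytes hold the four XMM registers, plus
 * 12 bytes for EAX/EDX/ECX, as in the patch. */
void X86CompilationCallback_SSE(void) {
  char SAVEBUF[64 + 12] __attribute__((aligned(16)));

  asm volatile (
    "movaps %%xmm0, (%0)\n"          /* movaps faults on unaligned   */
    "movaps %%xmm1, 16(%0)\n"        /* addresses, so the alignment  */
    "movaps %%xmm2, 32(%0)\n"        /* guarantee matters here       */
    "movaps %%xmm3, 48(%0)\n"
    :: "r"(SAVEBUF) : "memory" );

  /* ... remainder as in Corrado's original patch ... */
}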
This looks like an interesting idea. As written, the inline asms aren't
safe though; they reference %eax, %edx, etc. without declaring such things
in constraints, so the compiler wouldn't know that it can't clobber those
registers.

Dan

On Mar 11, 2009, at 2:39 PM, Corrado Zoccolo wrote:

> I don't know how to file a PR, but I have a patch (see below), that
> should work regardless of ABI differences, since it relies on the
> compiler to do the tough job.
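[Editor's note: for the restore side, the declaration Dan is asking for is
the clobber list of GCC's extended asm. A minimal sketch under that
assumption; the helper name is illustrative, and this is not a tested
change to the patch.]

/* The restore block with EAX/EDX/ECX declared as clobbers, so GCC knows
 * those registers are overwritten by the asm and must not hold live
 * values across it.  The save side is harder, since the asm *reads* the
 * incoming register values (see the sketch after Corrado's reply below). */
static void restore_int_regs(const char *savebuf) {
  asm volatile (
    "movl (%0),%%eax\n"
    "movl 4(%0),%%edx\n"
    "movl 8(%0),%%ecx\n"
    :: "r"(savebuf)
    : "memory", "eax", "edx", "ecx");   /* declare the written registers */
}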
On Thu, Mar 12, 2009 at 5:18 PM, Dan Gohman <gohman at apple.com> wrote:

> This looks like an interesting idea. As written, the inline asms aren't
> safe though; they reference %eax, %edx, etc. without declaring such
> things in constraints, so the compiler wouldn't know that it can't
> clobber those registers.
>
> Dan

I don't know how to specify such a constraint: it would have to forbid gcc
from using those registers *before* the asm block, while constraints can
only say how information is passed to/from the asm block and how the block
modifies the environment.

Actually, I don't think it is needed here, since those asm blocks sit at
the very beginning and the very end of the function (and are declared
volatile, i.e. they cannot be moved), so the compiler has no reason to use
those registers:

 * they are not used by the standard prologue/epilogue;
 * the only thing done before them is the allocation of the stack space,
   and since we use that buffer both before and after a call, the compiler
   must keep its address in one (or more) callee-saved registers, so it
   cannot use any of the registers we need to save.

Corrado
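[Editor's note: GCC does offer something close to what Corrado says cannot
be expressed: local register variables bound to specific registers, passed
as input operands. The documentation only guarantees the register choice
for the asm operands, not that an uninitialized variable observes the
caller's incoming value, so the following is purely a sketch of the
constraint syntax, with illustrative names, and not a verified fix for the
callback.]

#include <stdint.h>

/* Sketch: pin locals to EAX/EDX/ECX and pass them as inputs, so the
 * compiler at least knows the asm consumes whatever those registers hold
 * at this point.  Whether the uninitialized reads reliably carry the
 * function's incoming argument registers is not guaranteed. */
static void save_int_arg_regs(char *savebuf) {
  register uint32_t in_eax asm("eax");
  register uint32_t in_edx asm("edx");
  register uint32_t in_ecx asm("ecx");

  asm volatile (
    "movl %1,(%0)\n"
    "movl %2,4(%0)\n"
    "movl %3,8(%0)\n"
    :: "r"(savebuf), "r"(in_eax), "r"(in_edx), "r"(in_ecx)
    : "memory");
}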