I don't know how to file a PR, but I have a patch (see below) that
should work regardless of ABI differences, since it relies on the
compiler to do the tough job.
/*
 * Compilation-callback stub (SSE variant).
 *
 * Called from JIT stubs; it must preserve every register that may carry
 * arguments on 32-bit x86: EAX/EDX/ECX (register-argument GPRs) and
 * XMM0-XMM3 (SSE FP argument registers).  Strategy: spill those
 * registers into an alloca'd buffer via inline asm, let the compiler
 * emit the ABI-correct call to X86CompilationCallback2(), then reload
 * the registers before returning.
 *
 * NOTE(review): reading 4(%ebp) for the return address assumes the
 * compiler keeps a frame pointer here — confirm this file is built
 * without -fomit-frame-pointer.
 */
void X86CompilationCallback_SSE(void) {
char * SAVEBUF= (char*) alloca(64+12); // alloca is 16byte aligned:
                                       // 64 bytes for XMM0-3, 12 for GPRs
asm volatile (
"movl %%eax,(%0)\n"
"movl %%edx,4(%0)\n" // Save EAX/EDX/ECX
"movl %%ecx,8(%0)\n"
:: "r"(SAVEBUF+64): "memory" );
asm volatile (
// Save all XMM arg registers
"movaps %%xmm0, (%0)\n"
"movaps %%xmm1, 16(%0)\n"
"movaps %%xmm2, 32(%0)\n"
"movaps %%xmm3, 48(%0)\n"
:: "r"(SAVEBUF) : "memory" );
intptr_t *StackPtr=0, RetAddr=0;
asm volatile ( // get stack ptr and retaddr
"movl %%ebp,%0\n"
"movl 4(%%ebp),%1\n"
:"=r"(StackPtr), "=r"(RetAddr) :: "memory" );
X86CompilationCallback2(StackPtr,RetAddr); // gcc knows how to call
                                           // this according to the ABI
asm volatile ( // restore XMM arg registers
"movaps 48(%0), %%xmm3\n"
"movaps 32(%0), %%xmm2\n"
"movaps 16(%0), %%xmm1\n"
"movaps (%0), %%xmm0\n"
:: "r"(SAVEBUF) : "memory" );
asm volatile (
"movl (%0),%%eax\n"
"movl 4(%0),%%edx\n" // Restore EAX/EDX/ECX
"movl 8(%0),%%ecx\n"
:: "r"(SAVEBUF+64): "memory" );
}
The generated code is as follows:
Dump of assembler code for function X86CompilationCallback_SSE:
0xb74b98e0 <X86CompilationCallback_SSE+0>: push %ebp
0xb74b98e1 <X86CompilationCallback_SSE+1>: mov %esp,%ebp
0xb74b98e3 <X86CompilationCallback_SSE+3>: sub $0x78,%esp
0xb74b98e6 <X86CompilationCallback_SSE+6>: mov %esi,-0x8(%ebp)
0xb74b98e9 <X86CompilationCallback_SSE+9>: lea 0x17(%esp),%esi
0xb74b98ed <X86CompilationCallback_SSE+13>: and $0xfffffff0,%esi
0xb74b98f0 <X86CompilationCallback_SSE+16>: mov %ebx,-0xc(%ebp)
0xb74b98f3 <X86CompilationCallback_SSE+19>: mov %edi,-0x4(%ebp)
0xb74b98f6 <X86CompilationCallback_SSE+22>: lea 0x40(%esi),%edi
0xb74b98f9 <X86CompilationCallback_SSE+25>: call 0xb7315577
<__i686.get_pc_thunk.bx>
0xb74b98fe <X86CompilationCallback_SSE+30>: add $0x76d71e,%ebx
0xb74b9904 <X86CompilationCallback_SSE+36>: mov %eax,(%edi)
0xb74b9906 <X86CompilationCallback_SSE+38>: mov %edx,0x4(%edi)
0xb74b9909 <X86CompilationCallback_SSE+41>: mov %ecx,0x8(%edi)
0xb74b990c <X86CompilationCallback_SSE+44>: movaps %xmm0,(%esi)
0xb74b990f <X86CompilationCallback_SSE+47>: movaps %xmm1,0x10(%esi)
0xb74b9913 <X86CompilationCallback_SSE+51>: movaps %xmm2,0x20(%esi)
0xb74b9917 <X86CompilationCallback_SSE+55>: movaps %xmm3,0x30(%esi)
0xb74b991b <X86CompilationCallback_SSE+59>: mov %ebp,%edx
0xb74b991d <X86CompilationCallback_SSE+61>: mov 0x4(%ebp),%eax
0xb74b9920 <X86CompilationCallback_SSE+64>: mov %eax,0x4(%esp)
0xb74b9924 <X86CompilationCallback_SSE+68>: mov %edx,(%esp)
0xb74b9927 <X86CompilationCallback_SSE+71>: call 0xb7303348
<X86CompilationCallback2 at plt>
0xb74b992c <X86CompilationCallback_SSE+76>: movaps 0x30(%esi),%xmm3
0xb74b9930 <X86CompilationCallback_SSE+80>: movaps 0x20(%esi),%xmm2
0xb74b9934 <X86CompilationCallback_SSE+84>: movaps 0x10(%esi),%xmm1
0xb74b9938 <X86CompilationCallback_SSE+88>: movaps (%esi),%xmm0
0xb74b993b <X86CompilationCallback_SSE+91>: mov (%edi),%eax
0xb74b993d <X86CompilationCallback_SSE+93>: mov 0x4(%edi),%edx
0xb74b9940 <X86CompilationCallback_SSE+96>: mov 0x8(%edi),%ecx
0xb74b9943 <X86CompilationCallback_SSE+99>: mov -0xc(%ebp),%ebx
0xb74b9946 <X86CompilationCallback_SSE+102>: mov -0x8(%ebp),%esi
0xb74b9949 <X86CompilationCallback_SSE+105>: mov -0x4(%ebp),%edi
0xb74b994c <X86CompilationCallback_SSE+108>: mov %ebp,%esp
0xb74b994e <X86CompilationCallback_SSE+110>: pop %ebp
0xb74b994f <X86CompilationCallback_SSE+111>: ret
End of assembler dump.
And I verified that it works in my use case.
Clearly the same should be done for the other asm functions in that
same file (e.g. the non-SSE case).
Corrado