Stefan Kanthak via llvm-dev
2019-Jan-14 18:59 UTC
[llvm-dev] Where's the optimiser gone (part 10): sptting a cookie
Compile with -O3 -m32, or generate an assembly listing of __divdi3 and __moddi3 as shipped in clang_rt.builtins-i386.lib unsigned long long __udivmoddi4(unsigned long long numerator, unsigned long long denominator, unsigned long long *remainder); long long __moddi3(long long dividend, long long divisor) { long long r = divisor >> 63; // r = divisor < 0 ? -1 : 0 long long s = dividend >> 63; // s = dividend < 0 ? -1 : 0 divisor = (divisor ^ r) - r; // negate if divisor < 0 dividend = (dividend ^ s) - s; // negate if dividend < 0 __udivmoddi4(dividend, divisor, (unsigned long long *) &r); return (r ^ s) - s; // negate if dividend < 0 } ___moddi3: 00: 55 push ebp | 01: 89 E5 mov ebp, esp | 03: 53 push ebx | push ebx 04: 57 push edi | 05: 56 push esi | 06: 83 E4 F8 and esp, 0FFFFFFF8h | 09: 83 EC 10 sub esp, 10h | sub esp, 8 0C: 8B 45 14 mov eax, [ebp+14h] | mov eax, [esp+28] 0F: 8B 55 10 mov edx, [ebp+10h] | mov ecx, [esp+24] 12: 8B 35 00 00 00 00 mov esi, [___security_cookie] | 18: 89 E7 mov edi, esp | push esp 1A: 89 C1 mov ecx, eax | 1C: C1 F9 1F sar ecx, 1Fh | cdq 1F: 01 CA add edx, ecx | xor ecx, edx 21: 11 C8 adc eax, ecx | xor eax, edx 23: 31 CA xor edx, ecx | sub ecx, edx 25: 31 EE xor esi, ebp | 27: 31 C8 xor eax, ecx | sbb eax, edx | push eax | push ecx 29: 8B 4D 0C mov ecx, [ebp+0Ch] | mov eax, [esp+32] 2C: 89 74 24 08 mov [esp+8],esi | 30: 8B 75 08 mov esi, [ebp+8] | mov ecx, [esp+28] 33: 89 CB mov ebx, ecx | cdq 35: C1 FB 1F sar ebx, 1Fh | mov ebx, edx 38: 31 DE xor esi, ebx | xor ecx, edx 3A: 31 D9 xor ecx, ebx | xor eax, edx 3C: 29 DE sub esi, ebx | sub ecx, edx 3E: 19 D9 sbb ecx, ebx | sbb eax, edx 40: 57 push edi | 41: 50 push eax | push eax 42: 52 push edx | 43: 51 push ecx | push ecx 44: 56 push esi | 45: E8 00 00 00 00 call ___udivmoddi4 | call ___udivmoddi4 4A: 83 C4 14 add esp, 14h | add esp, 20 4D: 8B 3C 24 mov edi, [esp] | 50: 8B 74 24 04 mov esi, [esp+4] | mov eax, [esp] 54: 8B 4C 24 08 mov ecx, [esp+8] | mov edx, [esp+4] 58: 31 DF xor edi, 
ebx | xor eax, ebx 5A: 31 DE xor esi, ebx | xor edx, ebx 5C: 29 DF sub edi, ebx | sub eax, ebx 5E: 19 DE sbb esi, ebx | sbb edx, ebx 60: 31 E9 xor ecx, ebp | 62: E8 00 00 00 00 call @__security_check_cookie@4 | 67: 89 F8 mov eax, edi | 69: 89 F2 mov edx, esi | 6B: 8D 65 F4 lea esp, [ebp-0Ch] | add esp, 8 6E: 5E pop esi | 6F: 5F pop edi | 70: 5B pop ebx | pop ebx 71: 5D pop ebp | 72: C3 ret | ret clang generates 51 instructions, 18 more than properly optimised code, tinkers with a stack cookie, although there is no array allocated on the stack, and clobbers registers EDI and ESI without necessity. long long __divdi3(long long dividend, long long divisor) { long long r = divisor >> 63; // r = divisor < 0 ? -1 : 0 long long s = dividend >> 63; // s = dividend < 0 ? -1 : 0 divisor = (divisor ^ r) - r; // negate if divisor < 0 dividend = (dividend ^ s) - s; // negate if dividend < 0 s ^= r; // sign of quotient // negate if quotient < 0 return (__udivmoddi4(dividend, divisor, 0) ^ s) - s; } __divdi3: # @__divdi3 push ebx | push ebx push edi | push esi | mov ecx, dword ptr [esp + 28] | mov eax, [esp+20] mov eax, dword ptr [esp + 20] | mov edi, dword ptr [esp + 24] | mov ecx, [esp+16] mov ebx, dword ptr [esp + 16] | mov edx, ecx | mov esi, eax | sar edx, 31 | cdq sar esi, 31 | mov ebx, edx xor edi, edx | xor ecx, edx xor ecx, edx | xor eax, edx sub edi, edx | sub ecx, edx sbb ecx, edx | sbb eax, edx xor ebx, esi | xor eax, esi | sub ebx, esi | sbb eax, esi | xor esi, edx | sub esp, 12 # WTF? 
| push 0 | push 0 push ecx | push eax push edi | push ecx | mov eax, [esp+24] | mov ecx, [esp+20] | cdq | xor ecx, edx | xor eax, edx | sub ecx, edx | sbb eax, edx | xor ebx, edx push eax | push eax push ebx | push ecx call __udivmoddi4 | call __udivmoddi4 add esp, 32 | add esp, 20 xor eax, esi | xor eax, ebx xor edx, esi | xor edx, ebx sub eax, esi | sub eax, ebx sbb edx, esi | sbb edx, ebx pop esi | pop edi | pop ebx | pop ebx ret | ret clang generates 36 instructions, 6 more than properly optimised code, tinkers with ESP and clobbers registers EDI and ESI without necessity. stay tuned Stefan Kanthak