I submitted the problem report to clang's Bugzilla, but no one seems to care, so I have to send it to the mailing list. clang 3.7 svn (trunk 229055 at the time I reported this problem) generates slower code than 3.5 (Apple LLVM version 6.0 (clang-600.0.56) (based on LLVM 3.5svn)) for the following code. It is an "8 queens puzzle" solver written as an educational example. When compiled by either clang 3.5 or 3.7, it gives the correct answer, but clang 3.5 generates code which runs 20% faster than 3.6/3.7. ########################################## # clang 3.5 which comes with Xcode 6.1.1 ########################################## $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c $ time ./8 9 # 9 queens 352 solutions $ time ./8 10 # 10 queens ./8 9 1.63s user 0.00s system 99% cpu 1.632 total 724 solutions ./8 10 45.11s user 0.01s system 99% cpu 45.121 total ########################################## # clang 3.7 svn trunk ########################################## $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c $ time ./8 9 # 9 queens 352 solutions ./8 9 2.07s user 0.00s system 99% cpu 2.078 total $ time ./8 10 # 10 queens 724 solutions ./8 10 56.63s user 0.02s system 99% cpu 56.650 total The source code is below; I have also attached the executable files as well as the assembly listings for clang 3.5 and 3.6, as disassembled by IDA. The performance is even worse when compiling as 32-bit code, while gcc-4.9.2 is not affected. 
########## clang-3.5 $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c $ time ./8 9 352 solutions ./8 9 1.95s user 0.00s system 99% cpu 1.950 total ########## clang-3.7 $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c $ time ./8 9 352 solutions ./8 9 2.48s user 0.00s system 99% cpu 2.480 total ######### gcc-4.9.2 $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c $ time ./8 9 352 solutions ./8 9 1.44s user 0.00s system 99% cpu 1.442 total ``` #include <stdio.h> #include <stdlib.h> static inline int validate(int* a, int d) { int i, j, x; for (i = 0; i < d; ++i) { for (j = i+1, x = 1; j < d; ++j, ++x) { const int d = a[i] - a[j]; if (d == 0 || d == -x || d == x) return 0; } } return 1; } static inline int solve(int d) { int r = 0; int* a = (int*) calloc(sizeof(int), d+1); int p = d - 1; for (;;) { a[p]++; if (a[p] > d-1) { int bp = p - 1; while (bp >= 0) { a[bp]++; if (a[bp] <= d-1) break; a[bp] = 0; --bp; } if (bp < 0) break; a[p] = 0; } if (validate(a, d)) { ++r; } } free(a); return r; } int main(int argc, char** argv) { if (argc != 2) return -1; int r = solve((int) strtol(argv[1], NULL, 10)); printf("%d solutions\n", r); } ``` clang 3.5's result: ``` public _main _main proc near var_48 = qword ptr -48h var_40 = qword ptr -40h var_34 = dword ptr -34h push rbp push r15 push r14 push r13 push r12 push rbx sub rsp, 18h mov ebx, 0FFFFFFFFh cmp edi, 2 jnz loc_100000F29 mov rdi, [rsi+8] ; char * xor r14d, r14d xor esi, esi ; char ** mov edx, 0Ah ; int call _strtol mov r15, rax shl rax, 20h mov rsi, offset __mh_execute_header add rsi, rax sar rsi, 20h ; size_t mov edi, 4 ; size_t call _calloc lea edx, [r15-1] movsxd r8, edx mov ecx, r15d add ecx, 0FFFFFFFEh js loc_100000DFA test r15d, r15d mov r11d, [rax+r8*4] jle loc_100000EAE mov ecx, r15d add ecx, 0FFFFFFFEh mov [rsp+48h+var_34], ecx movsxd rcx, ecx lea rcx, [rax+rcx*4] mov [rsp+48h+var_40], rcx lea 
rcx, [rax+4] mov [rsp+48h+var_48], rcx xor r14d, r14d jmp short loc_100000D33 ; --------------------------------------------------------------------------- align 10h loc_100000D30: ; CODE XREF: _main+129 j ; _main+131 j ... add r14d, ebx loc_100000D33: ; CODE XREF: _main+92 j cmp r11d, edx lea edi, [r11+1] mov [rax+r8*4], edi mov rcx, [rsp+48h+var_40] mov esi, [rsp+48h+var_34] mov r11d, edi jl short loc_100000D84 nop dword ptr [rax+00h] loc_100000D50: ; CODE XREF: _main+DA j mov edi, [rcx] lea ebp, [rdi+1] mov [rcx], ebp cmp edi, edx jl short loc_100000D71 mov dword ptr [rcx], 0 add rcx, 0FFFFFFFFFFFFFFFCh test esi, esi lea esi, [rsi-1] jg short loc_100000D50 jmp loc_100000F0E ; --------------------------------------------------------------------------- loc_100000D71: ; CODE XREF: _main+C9 j test esi, esi js loc_100000F0E mov dword ptr [rax+r8*4], 0 xor r11d, r11d loc_100000D84: ; CODE XREF: _main+BA j cmp r15d, 1 mov esi, 0 mov r9, [rsp+48h+var_48] mov r12d, 1 jle short loc_100000DF0 loc_100000D99: ; CODE XREF: _main+15E j mov r10d, [rax+rsi*4] mov ecx, 0FFFFFFFFh mov edi, 1 mov r13, r9 nop word ptr [rax+rax+00h] loc_100000DB0: ; CODE XREF: _main+14F j xor ebx, ebx mov ebp, r10d sub ebp, [r13+0] jz loc_100000D30 cmp ecx, ebp jz loc_100000D30 cmp edi, ebp jz loc_100000D30 add r13, 4 inc rdi dec ecx mov ebx, edi add ebx, esi cmp ebx, r15d jl short loc_100000DB0 inc r12 add r9, 4 inc rsi cmp r12d, r15d jl short loc_100000D99 loc_100000DF0: ; CODE XREF: _main+107 j mov ebx, 1 jmp loc_100000D30 ; --------------------------------------------------------------------------- loc_100000DFA: ; CODE XREF: _main+5E j mov ecx, [rax+r8*4] lea r9d, [rcx+1] mov [rax+r8*4], r9d cmp ecx, r8d jge loc_100000F0E lea r12, [rax+4] xor r14d, r14d db 2Eh nop word ptr [rax+rax+00000000h] loc_100000E20: ; CODE XREF: _main+216 j test r15d, r15d setle cl cmp r15d, 2 jl short loc_100000E90 test cl, cl mov r13d, 0 mov r11, r12 mov r10d, 1 jnz short loc_100000E90 loc_100000E3F: ; CODE XREF: 
_main+1F0 j mov edi, [rax+r13*4] mov edx, 0FFFFFFFFh mov ecx, 1 mov rsi, r11 loc_100000E50: ; CODE XREF: _main+1E1 j xor ebx, ebx mov ebp, edi sub ebp, [rsi] jz short loc_100000E95 cmp edx, ebp jz short loc_100000E95 cmp ecx, ebp jz short loc_100000E95 add rsi, 4 inc rcx dec edx mov ebx, ecx add ebx, r13d cmp ebx, r15d jl short loc_100000E50 inc r10 add r11, 4 inc r13 cmp r10d, r15d jl short loc_100000E3F db 66h, 66h, 66h, 66h, 2Eh nop word ptr [rax+rax+00000000h] loc_100000E90: ; CODE XREF: _main+19A j ; _main+1AD j mov ebx, 1 loc_100000E95: ; CODE XREF: _main+1C6 j ; _main+1CA j ... add r14d, ebx cmp r9d, r8d lea ecx, [r9+1] mov [rax+r8*4], ecx mov r9d, ecx jl loc_100000E20 jmp short loc_100000F0E ; --------------------------------------------------------------------------- loc_100000EAE: ; CODE XREF: _main+6B j add r15d, 0FFFFFFFEh movsxd rcx, r15d lea rcx, [rax+rcx*4] xor r14d, r14d jmp short loc_100000EC6 ; --------------------------------------------------------------------------- align 20h loc_100000EC0: ; CODE XREF: _main+247 j ; _main+27C j inc r14d mov r11d, ebp loc_100000EC6: ; CODE XREF: _main+22C j lea ebp, [r11+1] mov [rax+r8*4], ebp cmp r11d, r8d mov rsi, rcx mov edi, r15d jl short loc_100000EC0 nop dword ptr [rax+00000000h] loc_100000EE0: ; CODE XREF: _main+26A j mov ebp, [rsi] lea ebx, [rbp+1] mov [rsi], ebx cmp ebp, edx jl short loc_100000EFE mov dword ptr [rsi], 0 add rsi, 0FFFFFFFFFFFFFFFCh test edi, edi lea edi, [rdi-1] jg short loc_100000EE0 jmp short loc_100000F0E ; --------------------------------------------------------------------------- loc_100000EFE: ; CODE XREF: _main+259 j test edi, edi js short loc_100000F0E mov dword ptr [rax+r8*4], 0 xor ebp, ebp jmp short loc_100000EC0 ; --------------------------------------------------------------------------- loc_100000F0E: ; CODE XREF: _main+DC j ; _main+E3 j ... 
mov rdi, rax ; void * call _free lea rdi, aDSolutions ; "%d solutions\n" xor ebx, ebx xor eax, eax mov esi, r14d call _printf loc_100000F29: ; CODE XREF: _main+16 j mov eax, ebx add rsp, 18h pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp retn _main endp ``` clang 3.6's result: ``` public _main _main proc near var_60 = qword ptr -60h var_58 = qword ptr -58h var_50 = qword ptr -50h var_48 = qword ptr -48h var_40 = qword ptr -40h var_38 = qword ptr -38h push rbp push r15 push r14 push r13 push r12 push rbx sub rsp, 38h mov ebx, 0FFFFFFFFh cmp edi, 2 jnz loc_100000F23 mov rbx, offset __mh_execute_header mov rdi, [rsi+8] ; char * xor r13d, r13d xor esi, esi ; char ** mov edx, 0Ah ; int call _strtol mov r14, rax shl rax, 20h mov [rsp+68h+var_38], rax lea rsi, [rax+rbx] sar rsi, 20h ; size_t mov edi, 4 ; size_t call _calloc lea r11d, [r14-1] movsxd r12, r11d mov [rsp+68h+var_40], r12 movsxd rcx, r14d mov [rsp+68h+var_50], rcx add ecx, 0FFFFFFFEh js loc_100000E1A mov ecx, r14d add ecx, 0FFFFFFFEh movsxd rcx, ecx inc rcx mov [rsp+68h+var_58], rcx mov rcx, rax add rcx, 4 mov [rsp+68h+var_60], rcx xor ebp, ebp jmp short loc_100000D17 ; --------------------------------------------------------------------------- align 10h loc_100000D10: ; CODE XREF: _main+15B j ; _main+163 j ... 
mov rbp, [rsp+68h+var_48] add ebp, edi loc_100000D17: ; CODE XREF: _main+93 j cmp r13d, r11d lea edx, [r13+1] mov [rax+r12*4], edx mov rcx, [rsp+68h+var_58] mov r13d, edx jl short loc_100000D6B nop dword ptr [rax+00h] loc_100000D30: ; CODE XREF: _main+DE j mov edx, [rax+rcx*4-4] lea esi, [rdx+1] mov [rax+rcx*4-4], esi cmp edx, r11d jl short loc_100000D60 mov dword ptr [rax+rcx*4-4], 0 dec rcx test rcx, rcx jg short loc_100000D30 jmp loc_100000F09 ; --------------------------------------------------------------------------- align 20h loc_100000D60: ; CODE XREF: _main+CE j mov dword ptr [rax+r12*4], 0 xor r13d, r13d loc_100000D6B: ; CODE XREF: _main+BA j mov [rsp+68h+var_48], rbp test r14d, r14d setle cl mov rdx, offset __mh_execute_header lea rdx, [rdx+1] cmp [rsp+68h+var_38], rdx jl loc_100000E10 test cl, cl mov edx, 0 mov r10, [rsp+68h+var_60] mov r9d, 1 jnz short loc_100000E10 loc_100000DA3: ; CODE XREF: _main+195 j mov esi, [rax+rdx*4] mov r15d, 0FFFFFFFFh mov r8d, 1 mov rcx, r10 db 66h, 66h, 2Eh nop dword ptr [rax+rax+00000000h] loc_100000DC0: ; CODE XREF: _main+184 j mov ebx, [rcx] mov ebp, esi sub ebp, ebx xor edi, edi cmp r8d, ebp jz loc_100000D10 cmp esi, ebx jz loc_100000D10 cmp r15d, ebp jz loc_100000D10 add rcx, 4 inc r8 dec r15d mov edi, r8d add edi, edx cmp edi, r14d jl short loc_100000DC0 inc r9 add r10, 4 inc rdx cmp r9, [rsp+68h+var_50] jl short loc_100000DA3 nop word ptr [rax+rax+00000000h] loc_100000E10: ; CODE XREF: _main+119 j ; _main+131 j mov edi, 1 jmp loc_100000D10 ; --------------------------------------------------------------------------- loc_100000E1A: ; CODE XREF: _main+6E j test r14d, r14d jle loc_100000F00 mov dword ptr [rax+r12*4], 1 xor ebp, ebp cmp r14d, 2 jl loc_100000F09 mov rcx, rax add rcx, 4 mov [rsp+68h+var_48], rcx xor ebp, ebp mov r15d, 1 nop dword ptr [rax+rax+00h] loc_100000E50: ; CODE XREF: _main+288 j mov rbx, rbp mov rcx, offset __mh_execute_header cmp [rsp+68h+var_38], rcx mov edx, 0 mov r13, [rsp+68h+var_48] mov r8d, 
1 mov r9d, 1 jle short loc_100000EE0 loc_100000E7A: ; CODE XREF: _main+25A j mov r12d, [rax+rdx*4] mov edi, 0FFFFFFFFh mov ecx, 1 mov rsi, r13 nop dword ptr [rax+rax+00h] loc_100000E90: ; CODE XREF: _main+249 j mov r10d, [rsi] mov ebp, r12d sub ebp, r10d xor r9d, r9d cmp ecx, ebp jz short loc_100000EE0 cmp r12d, r10d jz short loc_100000EE0 cmp edi, ebp jz short loc_100000EE0 add rsi, 4 inc rcx dec edi mov ebp, ecx add ebp, edx cmp ebp, r14d jl short loc_100000E90 inc r8 add r13, 4 inc rdx cmp r8, [rsp+68h+var_50] jl short loc_100000E7A mov r9d, 1 db 66h, 66h, 66h, 66h, 2Eh nop word ptr [rax+rax+00000000h] loc_100000EE0: ; CODE XREF: _main+208 j ; _main+22E j ... mov rbp, rbx add ebp, r9d cmp r15d, r11d lea ecx, [r15+1] mov rdx, [rsp+68h+var_40] mov [rax+rdx*4], ecx mov r15d, ecx jl loc_100000E50 jmp short loc_100000F09 ; --------------------------------------------------------------------------- loc_100000F00: ; CODE XREF: _main+1AD j xor ebp, ebp test r11d, r11d cmovns ebp, r11d loc_100000F09: ; CODE XREF: _main+E0 j ; _main+1C1 j ... 
mov rdi, rax ; void * call _free lea rdi, aDSolutions ; "%d solutions\n" xor ebx, ebx xor eax, eax mov esi, ebp call _printf loc_100000F23: ; CODE XREF: _main+16 j mov eax, ebx add rsp, 38h pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp retn _main endp ``` gcc-4.9.2's result: ``` _main proc near var_48 = qword ptr -48h var_40 = dword ptr -40h var_3C = dword ptr -3Ch cmp edi, 2 jz short loc_100000D69 or eax, 0FFFFFFFFh retn ; --------------------------------------------------------------------------- loc_100000D69: ; CODE XREF: _main+3 j push r15 mov edx, 0Ah ; int push r14 push r13 push r12 push rbp push rbx sub rsp, 18h mov rdi, [rsi+8] ; char * xor esi, esi ; char ** call _strtol mov edi, 4 ; size_t lea esi, [rax+1] mov r14, rax mov ebx, eax lea r15d, [r14-2] movsxd rsi, esi ; size_t call _calloc mov [rsp+48h+var_3C], 0 mov rdi, rax ; void * lea eax, [r14-1] cdqe lea r13, [rdi+rax*4] movsxd rax, r15d mov ebp, [r13+0] shl rax, 2 lea r12, [rdi+rax] lea rax, [rdi+rax-4] mov [rsp+48h+var_48], rax mov eax, r14d lea r14d, [r14+1] nop word ptr [rax+rax+00h] nop word ptr [rax+rax+00h] loc_100000DE0: ; CODE XREF: _main+12B j ; _main+155 j ... 
add ebp, 1 cmp ebx, ebp mov [r13+0], ebp jg short loc_100000E62 test r15d, r15d js short loc_100000E33 mov ecx, [r12] lea edx, [rcx+1] cmp ebx, edx mov [r12], edx jg short loc_100000E58 mov r8, r12 mov rcx, [rsp+48h+var_48] mov esi, r15d jmp short loc_100000E24 ; --------------------------------------------------------------------------- align 10h loc_100000E10: ; CODE XREF: _main+D1 j mov edx, [rcx] sub r8, 4 sub rcx, 4 add edx, 1 mov [rcx+4], edx cmp ebx, edx jg short loc_100000E58 loc_100000E24: ; CODE XREF: _main+A9 j sub esi, 1 mov dword ptr [r8], 0 cmp esi, 0FFFFFFFFh jnz short loc_100000E10 loc_100000E33: ; CODE XREF: _main+8E j call _free mov esi, [rsp+48h+var_3C] add rsp, 18h xor eax, eax pop rbx lea rdi, aDSolutions ; "%d solutions\n" pop rbp pop r12 pop r13 pop r14 pop r15 jmp _printf ; --------------------------------------------------------------------------- loc_100000E58: ; CODE XREF: _main+9D j ; _main+C2 j mov dword ptr [r13+0], 0 xor ebp, ebp loc_100000E62: ; CODE XREF: _main+89 j test ebx, ebx jle loc_100000EE6 lea r11, [rdi+8] xor r10d, r10d loc_100000E71: ; CODE XREF: _main+184 j add r10d, 1 cmp r10d, eax jz short loc_100000EE6 mov r8d, [r11-8] mov edx, r8d sub edx, [r11-4] add edx, 1 cmp edx, 2 jbe loc_100000DE0 mov r9d, r14d mov rcx, r11 mov edx, 1 mov [rsp+48h+var_40], r10d sub r9d, r10d jmp short loc_100000ED3 ; --------------------------------------------------------------------------- align 10h loc_100000EB0: ; CODE XREF: _main+179 j mov esi, r8d sub esi, [rcx] jz loc_100000DE0 mov r10d, esi add rcx, 4 add r10d, edx jz loc_100000DE0 cmp esi, edx jz loc_100000DE0 loc_100000ED3: ; CODE XREF: _main+144 j add edx, 1 cmp edx, r9d jnz short loc_100000EB0 mov r10d, [rsp+48h+var_40] add r11, 4 jmp short loc_100000E71 ; --------------------------------------------------------------------------- loc_100000EE6: ; CODE XREF: _main+104 j ; _main+118 j add [rsp+48h+var_3C], 1 jmp loc_100000DE0 _main endp ``` MSVC 10.0's result: ``` _main proc near ; 
CODE XREF: ___tmainCRTStartup+106 p var_80 = dword ptr -80h var_7C = dword ptr -7Ch var_78 = dword ptr -78h var_74 = dword ptr -74h var_70 = dword ptr -70h var_6C = dword ptr -6Ch var_68 = dword ptr -68h var_64 = dword ptr -64h var_60 = dword ptr -60h var_5C = dword ptr -5Ch argc = dword ptr 8 argv = dword ptr 0Ch envp = dword ptr 10h push ebp mov ebp, esp and esp, 0FFFFFF80h push esi push edi push ebx sub esp, 74h push 3 call sub_4080F0 add esp, 4 stmxcsr [esp+80h+var_80] or [esp+80h+var_80], 8000h ldmxcsr [esp+80h+var_80] cmp [ebp+argc], 2 jz short loc_40103A mov eax, 0FFFFFFFFh add esp, 74h pop ebx pop edi pop esi mov esp, ebp pop ebp retn ; --------------------------------------------------------------------------- loc_40103A: ; CODE XREF: _main+29 j call ds:GetTickCount mov esi, eax mov eax, [ebp+argv] push dword ptr [eax+4] ; char * call _atoi mov edi, eax lea eax, [edi+1] push eax ; size_t push 4 ; size_t call _calloc add esp, 0Ch mov ecx, [eax+edi*4-4] lea edx, [edi-1] mov [esp+80h+var_6C], ecx xor ebx, ebx mov [esp+80h+var_7C], ebx lea ecx, [eax+edi*4] mov [esp+80h+var_74], ecx lea ecx, [edi-2] mov [esp+80h+var_70], ecx mov [esp+80h+var_60], edx mov [esp+80h+var_80], esi mov ecx, [esp+80h+var_6C] loc_401087: ; CODE XREF: _main+142 j ; _main+193 j mov edx, [esp+80h+var_60] inc ecx mov [eax+edi*4-4], ecx cmp edi, [eax+edx*4] jg short loc_4010DC mov esi, [esp+80h+var_70] test esi, esi js short loc_4010CE xor edx, edx mov [esp+80h+var_78], eax xor ebx, ebx mov eax, [esp+80h+var_74] loc_4010A9: ; CODE XREF: _main+C8 j mov ecx, [eax+ebx*4-8] inc ecx cmp ecx, edi jl loc_40117A inc edx lea esi, [ebx+edi-3] mov dword ptr [eax+ebx*4-8], 0 dec ebx cmp edx, [esp+80h+var_60] jb short loc_4010A9 mov eax, [esp+80h+var_78] loc_4010CE: ; CODE XREF: _main+9B j ; _main+186 j test esi, esi jl short loc_401147 mov dword ptr [eax+edi*4-4], 0 xor ecx, ecx loc_4010DC: ; CODE XREF: _main+93 j test edi, edi jle short loc_40113E mov [esp+80h+var_6C], ecx xor edx, edx mov 
[esp+80h+var_5C], edi loc_4010EA: ; CODE XREF: _main+132 j lea ecx, [edx+1] mov ebx, ecx mov esi, ebx cmp ecx, [esp+80h+var_5C] jge short loc_401130 mov edx, [eax+edx*4] mov edi, 1 mov [esp+80h+var_64], esi mov [esp+80h+var_68], ecx loc_401107: ; CODE XREF: _main+122 j mov esi, [eax+ebx*4] cmp edx, esi jz short loc_40118B sub esi, edx mov ecx, esi neg ecx cmp edi, ecx jz short loc_40118B cmp esi, edi jz short loc_40118B inc ebx inc edi cmp ebx, [esp+80h+var_5C] jl short loc_401107 mov ecx, [esp+80h+var_68] mov esi, [esp+80h+var_64] cmp ecx, [esp+80h+var_5C] loc_401130: ; CODE XREF: _main+F5 j mov edx, esi jl short loc_4010EA xchg ax, ax mov ecx, [esp+80h+var_6C] mov edi, [esp+80h+var_5C] loc_40113E: ; CODE XREF: _main+DE j inc [esp+80h+var_7C] jmp loc_401087 ; --------------------------------------------------------------------------- loc_401147: ; CODE XREF: _main+D0 j mov ebx, [esp+80h+var_7C] mov esi, [esp+80h+var_80] push eax ; void * call _free add esp, 4 call ds:GetTickCount sub eax, esi push eax push ebx push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n" call _printf xor eax, eax add esp, 80h pop ebx pop edi pop esi mov esp, ebp pop ebp retn ; --------------------------------------------------------------------------- loc_40117A: ; CODE XREF: _main+B0 j mov edx, [esp+80h+var_74] mov eax, [esp+80h+var_78] mov [edx+ebx*4-8], ecx jmp loc_4010CE ; --------------------------------------------------------------------------- loc_40118B: ; CODE XREF: _main+10C j ; _main+116 j ... mov ecx, [esp+80h+var_6C] mov edi, [esp+80h+var_5C] jmp loc_401087 _main endp ```
Jack Howarth
2015-Feb-13 19:47 UTC
[LLVMdev] trunk's optimizer generates slower code than 3.5
Also confirmed with the LLVM 3.5.1 release and the LLVM 3.6 release branch on x86_64-apple-darwin14... % clang-3.5 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c % time ./8 9 352 solutions 3.603u 0.002s 0:03.60 100.0% 0+0k 0+0io 2pf+0w % time ./8 10 724 solutions 104.217u 0.059s 1:44.30 99.9% 0+0k 0+0io 2pf+0w % clang-3.6 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c % time ./8 9 352 solutions 4.050u 0.001s 0:04.05 100.0% 0+0k 0+0io 2pf+0w % time ./8 10 724 solutions 114.808u 0.041s 1:54.86 99.9% 0+0k 0+0io 2pf+0w On Fri, Feb 13, 2015 at 3:37 AM, 191919 <191919 at gmail.com> wrote: > I submitted the problem report to clang's bugzilla but no one seems to > care so I have to send it to the mailing list. > > clang 3.7 svn (trunk 229055 as the time I was to report this problem) > generates slower code than 3.5 (Apple LLVM version 6.0 > (clang-600.0.56) (based on LLVM 3.5svn)) for the following code. > > It is a "8 queens puzzle" solver written as an educational example. As > compiled by both clang 3.5 and 3.7, it gave the correct answer, but > clang 3.5 generates code which runs 20% faster than 3.6/3.7. 
> > ########################################## > # clang 3.5 which comes with Xcode 6.1.1 > ########################################## > $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 # 9 queens > 352 solutions > $ time ./8 10 # 10 queens > ./8 9 1.63s user 0.00s system 99% cpu 1.632 total > 724 solutions > ./8 10 45.11s user 0.01s system 99% cpu 45.121 total > > ########################################## > # clang 3.7 svn trunk > ########################################## > $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 # 9 queens > 352 solutions > ./8 9 2.07s user 0.00s system 99% cpu 2.078 total > $ time ./8 10 # 10 queens > 724 solutions > ./8 10 56.63s user 0.02s system 99% cpu 56.650 total > > The source code is below, I also attached the executable files as well > as the assembly code files for clang 3.5 and 3.6 by IDA. > > The performance is even worse when compiling as 32-bit code while > gcc-4.9.2 is not affected. 
> > ########## clang-3.5 > $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 1.95s user 0.00s system 99% cpu 1.950 total > > ########## clang-3.7 > $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 2.48s user 0.00s system 99% cpu 2.480 total > > ######### gcc-4.9.2 > $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 1.44s user 0.00s system 99% cpu 1.442 total > > > ``` > #include <stdio.h> > #include <stdlib.h> > > static inline int validate(int* a, int d) > { > int i, j, x; > for (i = 0; i < d; ++i) > { > for (j = i+1, x = 1; j < d; ++j, ++x) > { > const int d = a[i] - a[j]; > if (d == 0 || d == -x || d == x) return 0; > } > } > return 1; > } > > static inline int solve(int d) > { > int r = 0; > int* a = (int*) calloc(sizeof(int), d+1); > int p = d - 1; > > for (;;) > { > a[p]++; > > if (a[p] > d-1) > { > int bp = p - 1; > while (bp >= 0) > { > a[bp]++; > if (a[bp] <= d-1) break; > a[bp] = 0; > --bp; > } > if (bp < 0) > break; > a[p] = 0; > } > if (validate(a, d)) > { > ++r; > } > } > > free(a); > return r; > } > > int main(int argc, char** argv) > { > if (argc != 2) return -1; > int r = solve((int) strtol(argv[1], NULL, 10)); > printf("%d solutions\n", r); > } > ``` > > clang 3.5's result: > > ``` > public _main > _main proc near > > var_48 = qword ptr -48h > var_40 = qword ptr -40h > var_34 = dword ptr -34h > > push rbp > push r15 > push r14 > push r13 > push r12 > push rbx > sub rsp, 18h > mov ebx, 0FFFFFFFFh > cmp edi, 2 > jnz loc_100000F29 > mov rdi, [rsi+8] ; char * > xor r14d, r14d > xor esi, esi ; char ** > mov edx, 0Ah ; int > call _strtol > mov r15, rax > shl rax, 20h > mov rsi, offset __mh_execute_header > add rsi, rax > sar rsi, 20h ; size_t > mov edi, 4 ; size_t > call _calloc > lea edx, [r15-1] > movsxd r8, edx > 
mov ecx, r15d > add ecx, 0FFFFFFFEh > js loc_100000DFA > test r15d, r15d > mov r11d, [rax+r8*4] > jle loc_100000EAE > mov ecx, r15d > add ecx, 0FFFFFFFEh > mov [rsp+48h+var_34], ecx > movsxd rcx, ecx > lea rcx, [rax+rcx*4] > mov [rsp+48h+var_40], rcx > lea rcx, [rax+4] > mov [rsp+48h+var_48], rcx > xor r14d, r14d > jmp short loc_100000D33 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000D30: ; CODE XREF: _main+129 j > ; _main+131 j ... > add r14d, ebx > > loc_100000D33: ; CODE XREF: _main+92 j > cmp r11d, edx > lea edi, [r11+1] > mov [rax+r8*4], edi > mov rcx, [rsp+48h+var_40] > mov esi, [rsp+48h+var_34] > mov r11d, edi > jl short loc_100000D84 > nop dword ptr [rax+00h] > > loc_100000D50: ; CODE XREF: _main+DA j > mov edi, [rcx] > lea ebp, [rdi+1] > mov [rcx], ebp > cmp edi, edx > jl short loc_100000D71 > mov dword ptr [rcx], 0 > add rcx, 0FFFFFFFFFFFFFFFCh > test esi, esi > lea esi, [rsi-1] > jg short loc_100000D50 > jmp loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000D71: ; CODE XREF: _main+C9 j > test esi, esi > js loc_100000F0E > mov dword ptr [rax+r8*4], 0 > xor r11d, r11d > > loc_100000D84: ; CODE XREF: _main+BA j > cmp r15d, 1 > mov esi, 0 > mov r9, [rsp+48h+var_48] > mov r12d, 1 > jle short loc_100000DF0 > > loc_100000D99: ; CODE XREF: _main+15E j > mov r10d, [rax+rsi*4] > mov ecx, 0FFFFFFFFh > mov edi, 1 > mov r13, r9 > nop word ptr [rax+rax+00h] > > loc_100000DB0: ; CODE XREF: _main+14F j > xor ebx, ebx > mov ebp, r10d > sub ebp, [r13+0] > jz loc_100000D30 > cmp ecx, ebp > jz loc_100000D30 > cmp edi, ebp > jz loc_100000D30 > add r13, 4 > inc rdi > dec ecx > mov ebx, edi > add ebx, esi > cmp ebx, r15d > jl short loc_100000DB0 > inc r12 > add r9, 4 > inc rsi > cmp r12d, r15d > jl short loc_100000D99 > > loc_100000DF0: ; CODE XREF: _main+107 j > mov ebx, 1 > jmp loc_100000D30 > ; 
--------------------------------------------------------------------------- > > loc_100000DFA: ; CODE XREF: _main+5E j > mov ecx, [rax+r8*4] > lea r9d, [rcx+1] > mov [rax+r8*4], r9d > cmp ecx, r8d > jge loc_100000F0E > lea r12, [rax+4] > xor r14d, r14d > db 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000E20: ; CODE XREF: _main+216 j > test r15d, r15d > setle cl > cmp r15d, 2 > jl short loc_100000E90 > test cl, cl > mov r13d, 0 > mov r11, r12 > mov r10d, 1 > jnz short loc_100000E90 > > loc_100000E3F: ; CODE XREF: _main+1F0 j > mov edi, [rax+r13*4] > mov edx, 0FFFFFFFFh > mov ecx, 1 > mov rsi, r11 > > loc_100000E50: ; CODE XREF: _main+1E1 j > xor ebx, ebx > mov ebp, edi > sub ebp, [rsi] > jz short loc_100000E95 > cmp edx, ebp > jz short loc_100000E95 > cmp ecx, ebp > jz short loc_100000E95 > add rsi, 4 > inc rcx > dec edx > mov ebx, ecx > add ebx, r13d > cmp ebx, r15d > jl short loc_100000E50 > inc r10 > add r11, 4 > inc r13 > cmp r10d, r15d > jl short loc_100000E3F > db 66h, 66h, 66h, 66h, 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000E90: ; CODE XREF: _main+19A j > ; _main+1AD j > mov ebx, 1 > > loc_100000E95: ; CODE XREF: _main+1C6 j > ; _main+1CA j ... 
> add r14d, ebx > cmp r9d, r8d > lea ecx, [r9+1] > mov [rax+r8*4], ecx > mov r9d, ecx > jl loc_100000E20 > jmp short loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000EAE: ; CODE XREF: _main+6B j > add r15d, 0FFFFFFFEh > movsxd rcx, r15d > lea rcx, [rax+rcx*4] > xor r14d, r14d > jmp short loc_100000EC6 > ; --------------------------------------------------------------------------- > align 20h > > loc_100000EC0: ; CODE XREF: _main+247 j > ; _main+27C j > inc r14d > mov r11d, ebp > > loc_100000EC6: ; CODE XREF: _main+22C j > lea ebp, [r11+1] > mov [rax+r8*4], ebp > cmp r11d, r8d > mov rsi, rcx > mov edi, r15d > jl short loc_100000EC0 > nop dword ptr [rax+00000000h] > > loc_100000EE0: ; CODE XREF: _main+26A j > mov ebp, [rsi] > lea ebx, [rbp+1] > mov [rsi], ebx > cmp ebp, edx > jl short loc_100000EFE > mov dword ptr [rsi], 0 > add rsi, 0FFFFFFFFFFFFFFFCh > test edi, edi > lea edi, [rdi-1] > jg short loc_100000EE0 > jmp short loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000EFE: ; CODE XREF: _main+259 j > test edi, edi > js short loc_100000F0E > mov dword ptr [rax+r8*4], 0 > xor ebp, ebp > jmp short loc_100000EC0 > ; --------------------------------------------------------------------------- > > loc_100000F0E: ; CODE XREF: _main+DC j > ; _main+E3 j ... 
> mov rdi, rax ; void * > call _free > lea rdi, aDSolutions ; "%d solutions\n" > xor ebx, ebx > xor eax, eax > mov esi, r14d > call _printf > > loc_100000F29: ; CODE XREF: _main+16 j > mov eax, ebx > add rsp, 18h > pop rbx > pop r12 > pop r13 > pop r14 > pop r15 > pop rbp > retn > _main endp > ``` > > clang 3.6's result: > > ``` > public _main > _main proc near > > var_60 = qword ptr -60h > var_58 = qword ptr -58h > var_50 = qword ptr -50h > var_48 = qword ptr -48h > var_40 = qword ptr -40h > var_38 = qword ptr -38h > > push rbp > push r15 > push r14 > push r13 > push r12 > push rbx > sub rsp, 38h > mov ebx, 0FFFFFFFFh > cmp edi, 2 > jnz loc_100000F23 > mov rbx, offset __mh_execute_header > mov rdi, [rsi+8] ; char * > xor r13d, r13d > xor esi, esi ; char ** > mov edx, 0Ah ; int > call _strtol > mov r14, rax > shl rax, 20h > mov [rsp+68h+var_38], rax > lea rsi, [rax+rbx] > sar rsi, 20h ; size_t > mov edi, 4 ; size_t > call _calloc > lea r11d, [r14-1] > movsxd r12, r11d > mov [rsp+68h+var_40], r12 > movsxd rcx, r14d > mov [rsp+68h+var_50], rcx > add ecx, 0FFFFFFFEh > js loc_100000E1A > mov ecx, r14d > add ecx, 0FFFFFFFEh > movsxd rcx, ecx > inc rcx > mov [rsp+68h+var_58], rcx > mov rcx, rax > add rcx, 4 > mov [rsp+68h+var_60], rcx > xor ebp, ebp > jmp short loc_100000D17 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000D10: ; CODE XREF: _main+15B j > ; _main+163 j ... 
> mov rbp, [rsp+68h+var_48] > add ebp, edi > > loc_100000D17: ; CODE XREF: _main+93 j > cmp r13d, r11d > lea edx, [r13+1] > mov [rax+r12*4], edx > mov rcx, [rsp+68h+var_58] > mov r13d, edx > jl short loc_100000D6B > nop dword ptr [rax+00h] > > loc_100000D30: ; CODE XREF: _main+DE j > mov edx, [rax+rcx*4-4] > lea esi, [rdx+1] > mov [rax+rcx*4-4], esi > cmp edx, r11d > jl short loc_100000D60 > mov dword ptr [rax+rcx*4-4], 0 > dec rcx > test rcx, rcx > jg short loc_100000D30 > jmp loc_100000F09 > ; --------------------------------------------------------------------------- > align 20h > > loc_100000D60: ; CODE XREF: _main+CE j > mov dword ptr [rax+r12*4], 0 > xor r13d, r13d > > loc_100000D6B: ; CODE XREF: _main+BA j > mov [rsp+68h+var_48], rbp > test r14d, r14d > setle cl > mov rdx, offset __mh_execute_header > lea rdx, [rdx+1] > cmp [rsp+68h+var_38], rdx > jl loc_100000E10 > test cl, cl > mov edx, 0 > mov r10, [rsp+68h+var_60] > mov r9d, 1 > jnz short loc_100000E10 > > loc_100000DA3: ; CODE XREF: _main+195 j > mov esi, [rax+rdx*4] > mov r15d, 0FFFFFFFFh > mov r8d, 1 > mov rcx, r10 > db 66h, 66h, 2Eh > nop dword ptr [rax+rax+00000000h] > > loc_100000DC0: ; CODE XREF: _main+184 j > mov ebx, [rcx] > mov ebp, esi > sub ebp, ebx > xor edi, edi > cmp r8d, ebp > jz loc_100000D10 > cmp esi, ebx > jz loc_100000D10 > cmp r15d, ebp > jz loc_100000D10 > add rcx, 4 > inc r8 > dec r15d > mov edi, r8d > add edi, edx > cmp edi, r14d > jl short loc_100000DC0 > inc r9 > add r10, 4 > inc rdx > cmp r9, [rsp+68h+var_50] > jl short loc_100000DA3 > nop word ptr [rax+rax+00000000h] > > loc_100000E10: ; CODE XREF: _main+119 j > ; _main+131 j > mov edi, 1 > jmp loc_100000D10 > ; --------------------------------------------------------------------------- > > loc_100000E1A: ; CODE XREF: _main+6E j > test r14d, r14d > jle loc_100000F00 > mov dword ptr [rax+r12*4], 1 > xor ebp, ebp > cmp r14d, 2 > jl loc_100000F09 > mov rcx, rax > add rcx, 4 > mov [rsp+68h+var_48], rcx > xor ebp, ebp > mov r15d, 
1 > nop dword ptr [rax+rax+00h] > > loc_100000E50: ; CODE XREF: _main+288 j > mov rbx, rbp > mov rcx, offset __mh_execute_header > cmp [rsp+68h+var_38], rcx > mov edx, 0 > mov r13, [rsp+68h+var_48] > mov r8d, 1 > mov r9d, 1 > jle short loc_100000EE0 > > loc_100000E7A: ; CODE XREF: _main+25A j > mov r12d, [rax+rdx*4] > mov edi, 0FFFFFFFFh > mov ecx, 1 > mov rsi, r13 > nop dword ptr [rax+rax+00h] > > loc_100000E90: ; CODE XREF: _main+249 j > mov r10d, [rsi] > mov ebp, r12d > sub ebp, r10d > xor r9d, r9d > cmp ecx, ebp > jz short loc_100000EE0 > cmp r12d, r10d > jz short loc_100000EE0 > cmp edi, ebp > jz short loc_100000EE0 > add rsi, 4 > inc rcx > dec edi > mov ebp, ecx > add ebp, edx > cmp ebp, r14d > jl short loc_100000E90 > inc r8 > add r13, 4 > inc rdx > cmp r8, [rsp+68h+var_50] > jl short loc_100000E7A > mov r9d, 1 > db 66h, 66h, 66h, 66h, 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000EE0: ; CODE XREF: _main+208 j > ; _main+22E j ... > mov rbp, rbx > add ebp, r9d > cmp r15d, r11d > lea ecx, [r15+1] > mov rdx, [rsp+68h+var_40] > mov [rax+rdx*4], ecx > mov r15d, ecx > jl loc_100000E50 > jmp short loc_100000F09 > ; --------------------------------------------------------------------------- > > loc_100000F00: ; CODE XREF: _main+1AD j > xor ebp, ebp > test r11d, r11d > cmovns ebp, r11d > > loc_100000F09: ; CODE XREF: _main+E0 j > ; _main+1C1 j ... 
> mov rdi, rax ; void * > call _free > lea rdi, aDSolutions ; "%d solutions\n" > xor ebx, ebx > xor eax, eax > mov esi, ebp > call _printf > > loc_100000F23: ; CODE XREF: _main+16 j > mov eax, ebx > add rsp, 38h > pop rbx > pop r12 > pop r13 > pop r14 > pop r15 > pop rbp > retn > _main endp > ``` > > gcc-4.9.2's result: > ``` > > _main proc near > > var_48 = qword ptr -48h > var_40 = dword ptr -40h > var_3C = dword ptr -3Ch > > cmp edi, 2 > jz short loc_100000D69 > or eax, 0FFFFFFFFh > retn > ; --------------------------------------------------------------------------- > > loc_100000D69: ; CODE XREF: _main+3 j > push r15 > mov edx, 0Ah ; int > push r14 > push r13 > push r12 > push rbp > push rbx > sub rsp, 18h > mov rdi, [rsi+8] ; char * > xor esi, esi ; char ** > call _strtol > mov edi, 4 ; size_t > lea esi, [rax+1] > mov r14, rax > mov ebx, eax > lea r15d, [r14-2] > movsxd rsi, esi ; size_t > call _calloc > mov [rsp+48h+var_3C], 0 > mov rdi, rax ; void * > lea eax, [r14-1] > cdqe > lea r13, [rdi+rax*4] > movsxd rax, r15d > mov ebp, [r13+0] > shl rax, 2 > lea r12, [rdi+rax] > lea rax, [rdi+rax-4] > mov [rsp+48h+var_48], rax > mov eax, r14d > lea r14d, [r14+1] > nop word ptr [rax+rax+00h] > nop word ptr [rax+rax+00h] > > loc_100000DE0: ; CODE XREF: _main+12B j > ; _main+155 j ... 
> add ebp, 1 > cmp ebx, ebp > mov [r13+0], ebp > jg short loc_100000E62 > test r15d, r15d > js short loc_100000E33 > mov ecx, [r12] > lea edx, [rcx+1] > cmp ebx, edx > mov [r12], edx > jg short loc_100000E58 > mov r8, r12 > mov rcx, [rsp+48h+var_48] > mov esi, r15d > jmp short loc_100000E24 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000E10: ; CODE XREF: _main+D1 j > mov edx, [rcx] > sub r8, 4 > sub rcx, 4 > add edx, 1 > mov [rcx+4], edx > cmp ebx, edx > jg short loc_100000E58 > > loc_100000E24: ; CODE XREF: _main+A9 j > sub esi, 1 > mov dword ptr [r8], 0 > cmp esi, 0FFFFFFFFh > jnz short loc_100000E10 > > loc_100000E33: ; CODE XREF: _main+8E j > call _free > mov esi, [rsp+48h+var_3C] > add rsp, 18h > xor eax, eax > pop rbx > lea rdi, aDSolutions ; "%d solutions\n" > pop rbp > pop r12 > pop r13 > pop r14 > pop r15 > jmp _printf > ; --------------------------------------------------------------------------- > > loc_100000E58: ; CODE XREF: _main+9D j > ; _main+C2 j > mov dword ptr [r13+0], 0 > xor ebp, ebp > > loc_100000E62: ; CODE XREF: _main+89 j > test ebx, ebx > jle loc_100000EE6 > lea r11, [rdi+8] > xor r10d, r10d > > loc_100000E71: ; CODE XREF: _main+184 j > add r10d, 1 > cmp r10d, eax > jz short loc_100000EE6 > mov r8d, [r11-8] > mov edx, r8d > sub edx, [r11-4] > add edx, 1 > cmp edx, 2 > jbe loc_100000DE0 > mov r9d, r14d > mov rcx, r11 > mov edx, 1 > mov [rsp+48h+var_40], r10d > sub r9d, r10d > jmp short loc_100000ED3 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000EB0: ; CODE XREF: _main+179 j > mov esi, r8d > sub esi, [rcx] > jz loc_100000DE0 > mov r10d, esi > add rcx, 4 > add r10d, edx > jz loc_100000DE0 > cmp esi, edx > jz loc_100000DE0 > > loc_100000ED3: ; CODE XREF: _main+144 j > add edx, 1 > cmp edx, r9d > jnz short loc_100000EB0 > mov r10d, [rsp+48h+var_40] > add r11, 4 > jmp short loc_100000E71 > ; 
--------------------------------------------------------------------------- > > loc_100000EE6: ; CODE XREF: _main+104 j > ; _main+118 j > add [rsp+48h+var_3C], 1 > jmp loc_100000DE0 > _main endp > ``` > > MSVC 10.0's result: > > ``` > > _main proc near ; CODE XREF: ___tmainCRTStartup+106 p > > var_80 = dword ptr -80h > var_7C = dword ptr -7Ch > var_78 = dword ptr -78h > var_74 = dword ptr -74h > var_70 = dword ptr -70h > var_6C = dword ptr -6Ch > var_68 = dword ptr -68h > var_64 = dword ptr -64h > var_60 = dword ptr -60h > var_5C = dword ptr -5Ch > argc = dword ptr 8 > argv = dword ptr 0Ch > envp = dword ptr 10h > > push ebp > mov ebp, esp > and esp, 0FFFFFF80h > push esi > push edi > push ebx > sub esp, 74h > push 3 > call sub_4080F0 > add esp, 4 > stmxcsr [esp+80h+var_80] > or [esp+80h+var_80], 8000h > ldmxcsr [esp+80h+var_80] > cmp [ebp+argc], 2 > jz short loc_40103A > mov eax, 0FFFFFFFFh > add esp, 74h > pop ebx > pop edi > pop esi > mov esp, ebp > pop ebp > retn > ; --------------------------------------------------------------------------- > > loc_40103A: ; CODE XREF: _main+29 j > call ds:GetTickCount > mov esi, eax > mov eax, [ebp+argv] > push dword ptr [eax+4] ; char * > call _atoi > mov edi, eax > lea eax, [edi+1] > push eax ; size_t > push 4 ; size_t > call _calloc > add esp, 0Ch > mov ecx, [eax+edi*4-4] > lea edx, [edi-1] > mov [esp+80h+var_6C], ecx > xor ebx, ebx > mov [esp+80h+var_7C], ebx > lea ecx, [eax+edi*4] > mov [esp+80h+var_74], ecx > lea ecx, [edi-2] > mov [esp+80h+var_70], ecx > mov [esp+80h+var_60], edx > mov [esp+80h+var_80], esi > mov ecx, [esp+80h+var_6C] > > loc_401087: ; CODE XREF: _main+142 j > ; _main+193 j > mov edx, [esp+80h+var_60] > inc ecx > mov [eax+edi*4-4], ecx > cmp edi, [eax+edx*4] > jg short loc_4010DC > mov esi, [esp+80h+var_70] > test esi, esi > js short loc_4010CE > xor edx, edx > mov [esp+80h+var_78], eax > xor ebx, ebx > mov eax, [esp+80h+var_74] > > loc_4010A9: ; CODE XREF: _main+C8 j > mov ecx, [eax+ebx*4-8] > inc ecx 
> cmp ecx, edi > jl loc_40117A > inc edx > lea esi, [ebx+edi-3] > mov dword ptr [eax+ebx*4-8], 0 > dec ebx > cmp edx, [esp+80h+var_60] > jb short loc_4010A9 > mov eax, [esp+80h+var_78] > > loc_4010CE: ; CODE XREF: _main+9B j > ; _main+186 j > test esi, esi > jl short loc_401147 > mov dword ptr [eax+edi*4-4], 0 > xor ecx, ecx > > loc_4010DC: ; CODE XREF: _main+93 j > test edi, edi > jle short loc_40113E > mov [esp+80h+var_6C], ecx > xor edx, edx > mov [esp+80h+var_5C], edi > > loc_4010EA: ; CODE XREF: _main+132 j > lea ecx, [edx+1] > mov ebx, ecx > mov esi, ebx > cmp ecx, [esp+80h+var_5C] > jge short loc_401130 > mov edx, [eax+edx*4] > mov edi, 1 > mov [esp+80h+var_64], esi > mov [esp+80h+var_68], ecx > > loc_401107: ; CODE XREF: _main+122 j > mov esi, [eax+ebx*4] > cmp edx, esi > jz short loc_40118B > sub esi, edx > mov ecx, esi > neg ecx > cmp edi, ecx > jz short loc_40118B > cmp esi, edi > jz short loc_40118B > inc ebx > inc edi > cmp ebx, [esp+80h+var_5C] > jl short loc_401107 > mov ecx, [esp+80h+var_68] > mov esi, [esp+80h+var_64] > cmp ecx, [esp+80h+var_5C] > > loc_401130: ; CODE XREF: _main+F5 j > mov edx, esi > jl short loc_4010EA > xchg ax, ax > mov ecx, [esp+80h+var_6C] > mov edi, [esp+80h+var_5C] > > loc_40113E: ; CODE XREF: _main+DE j > inc [esp+80h+var_7C] > jmp loc_401087 > ; --------------------------------------------------------------------------- > > loc_401147: ; CODE XREF: _main+D0 j > mov ebx, [esp+80h+var_7C] > mov esi, [esp+80h+var_80] > push eax ; void * > call _free > add esp, 4 > call ds:GetTickCount > sub eax, esi > push eax > push ebx > push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n" > call _printf > xor eax, eax > add esp, 80h > pop ebx > pop edi > pop esi > mov esp, ebp > pop ebp > retn > ; --------------------------------------------------------------------------- > > loc_40117A: ; CODE XREF: _main+B0 j > mov edx, [esp+80h+var_74] > mov eax, [esp+80h+var_78] > mov [edx+ebx*4-8], ecx > jmp loc_4010CE > ; 
--------------------------------------------------------------------------- > > loc_40118B: ; CODE XREF: _main+10C j > ; _main+116 j ... > mov ecx, [esp+80h+var_6C] > mov edi, [esp+80h+var_5C] > jmp loc_401087 > _main endp > ``` > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Jack Howarth
2015-Feb-14 16:13 UTC
[LLVMdev] trunk's optimizer generates slower code than 3.5
The regressions in the performance of generated code, introduced by the llvm 3.6 release, don't seem to be limited to this "8 queens puzzle" solver test case. See... http://www.phoronix.com/scan.php?page=article&item=llvm-clang-3.5-3.6-rc1&num=1 where a big hit in the performance of the Sparse Matrix Multiply test of the SciMark v2.0 benchmark was observed as well as others. Do you really want to release 3.6 with this level of performance regression? Jack On Fri, Feb 13, 2015 at 2:47 PM, Jack Howarth <howarth.mailing.lists at gmail.com> wrote: > Also confirmed with the llvm 3.5.1 release and the llvm 3.6 release > branch on x86_64-apple-darwin14... > > % clang-3.5 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > % time ./8 9 > 352 solutions > 3.603u 0.002s 0:03.60 100.0% 0+0k 0+0io 2pf+0w > % time ./8 10 > 724 solutions > 104.217u 0.059s 1:44.30 99.9% 0+0k 0+0io 2pf+0w > > % clang-3.6 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > % time ./8 9 > 352 solutions > 4.050u 0.001s 0:04.05 100.0% 0+0k 0+0io 2pf+0w > % time ./8 10 > 724 solutions > 114.808u 0.041s 1:54.86 99.9% 0+0k 0+0io 2pf+0w > > On Fri, Feb 13, 2015 at 3:37 AM, 191919 <191919 at gmail.com> wrote: >> I submitted the problem report to clang's bugzilla but no one seems to >> care so I have to send it to the mailing list. >> >> clang 3.7 svn (trunk 229055 as the time I was to report this problem) >> generates slower code than 3.5 (Apple LLVM version 6.0 >> (clang-600.0.56) (based on LLVM 3.5svn)) for the following code. >> >> It is a "8 queens puzzle" solver written as an educational example. As >> compiled by both clang 3.5 and 3.7, it gave the correct answer, but >> clang 3.5 generates code which runs 20% faster than 3.6/3.7. 
>> >> ########################################## >> # clang 3.5 which comes with Xcode 6.1.1 >> ########################################## >> $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector >> -fno-exceptions -o 8 8.c >> $ time ./8 9 # 9 queens >> 352 solutions >> $ time ./8 10 # 10 queens >> ./8 9 1.63s user 0.00s system 99% cpu 1.632 total >> 724 solutions >> ./8 10 45.11s user 0.01s system 99% cpu 45.121 total >> >> ########################################## >> # clang 3.7 svn trunk >> ########################################## >> $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector >> -fno-exceptions -o 8 8.c >> $ time ./8 9 # 9 queens >> 352 solutions >> ./8 9 2.07s user 0.00s system 99% cpu 2.078 total >> $ time ./8 10 # 10 queens >> 724 solutions >> ./8 10 56.63s user 0.02s system 99% cpu 56.650 total >> >> The source code is below, I also attached the executable files as well >> as the assembly code files for clang 3.5 and 3.6 by IDA. >> >> The performance is even worse when compiling as 32-bit code while >> gcc-4.9.2 is not affected. 
>> >> ########## clang-3.5 >> $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector >> -fno-exceptions -o 8 8.c >> $ time ./8 9 >> 352 solutions >> ./8 9 1.95s user 0.00s system 99% cpu 1.950 total >> >> ########## clang-3.7 >> $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector >> -fno-exceptions -o 8 8.c >> $ time ./8 9 >> 352 solutions >> ./8 9 2.48s user 0.00s system 99% cpu 2.480 total >> >> ######### gcc-4.9.2 >> $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector >> -fno-exceptions -o 8 8.c >> $ time ./8 9 >> 352 solutions >> ./8 9 1.44s user 0.00s system 99% cpu 1.442 total >> >> >> ``` >> #include <stdio.h> >> #include <stdlib.h> >> >> static inline int validate(int* a, int d) >> { >> int i, j, x; >> for (i = 0; i < d; ++i) >> { >> for (j = i+1, x = 1; j < d; ++j, ++x) >> { >> const int d = a[i] - a[j]; >> if (d == 0 || d == -x || d == x) return 0; >> } >> } >> return 1; >> } >> >> static inline int solve(int d) >> { >> int r = 0; >> int* a = (int*) calloc(sizeof(int), d+1); >> int p = d - 1; >> >> for (;;) >> { >> a[p]++; >> >> if (a[p] > d-1) >> { >> int bp = p - 1; >> while (bp >= 0) >> { >> a[bp]++; >> if (a[bp] <= d-1) break; >> a[bp] = 0; >> --bp; >> } >> if (bp < 0) >> break; >> a[p] = 0; >> } >> if (validate(a, d)) >> { >> ++r; >> } >> } >> >> free(a); >> return r; >> } >> >> int main(int argc, char** argv) >> { >> if (argc != 2) return -1; >> int r = solve((int) strtol(argv[1], NULL, 10)); >> printf("%d solutions\n", r); >> } >> ``` >> >> clang 3.5's result: >> >> ``` >> public _main >> _main proc near >> >> var_48 = qword ptr -48h >> var_40 = qword ptr -40h >> var_34 = dword ptr -34h >> >> push rbp >> push r15 >> push r14 >> push r13 >> push r12 >> push rbx >> sub rsp, 18h >> mov ebx, 0FFFFFFFFh >> cmp edi, 2 >> jnz loc_100000F29 >> mov rdi, [rsi+8] ; char * >> xor r14d, r14d >> xor esi, esi ; char ** >> mov edx, 0Ah ; int >> call _strtol >> mov r15, rax >> shl rax, 20h >> mov rsi, offset __mh_execute_header >> 
add rsi, rax >> sar rsi, 20h ; size_t >> mov edi, 4 ; size_t >> call _calloc >> lea edx, [r15-1] >> movsxd r8, edx >> mov ecx, r15d >> add ecx, 0FFFFFFFEh >> js loc_100000DFA >> test r15d, r15d >> mov r11d, [rax+r8*4] >> jle loc_100000EAE >> mov ecx, r15d >> add ecx, 0FFFFFFFEh >> mov [rsp+48h+var_34], ecx >> movsxd rcx, ecx >> lea rcx, [rax+rcx*4] >> mov [rsp+48h+var_40], rcx >> lea rcx, [rax+4] >> mov [rsp+48h+var_48], rcx >> xor r14d, r14d >> jmp short loc_100000D33 >> ; --------------------------------------------------------------------------- >> align 10h >> >> loc_100000D30: ; CODE XREF: _main+129 j >> ; _main+131 j ... >> add r14d, ebx >> >> loc_100000D33: ; CODE XREF: _main+92 j >> cmp r11d, edx >> lea edi, [r11+1] >> mov [rax+r8*4], edi >> mov rcx, [rsp+48h+var_40] >> mov esi, [rsp+48h+var_34] >> mov r11d, edi >> jl short loc_100000D84 >> nop dword ptr [rax+00h] >> >> loc_100000D50: ; CODE XREF: _main+DA j >> mov edi, [rcx] >> lea ebp, [rdi+1] >> mov [rcx], ebp >> cmp edi, edx >> jl short loc_100000D71 >> mov dword ptr [rcx], 0 >> add rcx, 0FFFFFFFFFFFFFFFCh >> test esi, esi >> lea esi, [rsi-1] >> jg short loc_100000D50 >> jmp loc_100000F0E >> ; --------------------------------------------------------------------------- >> >> loc_100000D71: ; CODE XREF: _main+C9 j >> test esi, esi >> js loc_100000F0E >> mov dword ptr [rax+r8*4], 0 >> xor r11d, r11d >> >> loc_100000D84: ; CODE XREF: _main+BA j >> cmp r15d, 1 >> mov esi, 0 >> mov r9, [rsp+48h+var_48] >> mov r12d, 1 >> jle short loc_100000DF0 >> >> loc_100000D99: ; CODE XREF: _main+15E j >> mov r10d, [rax+rsi*4] >> mov ecx, 0FFFFFFFFh >> mov edi, 1 >> mov r13, r9 >> nop word ptr [rax+rax+00h] >> >> loc_100000DB0: ; CODE XREF: _main+14F j >> xor ebx, ebx >> mov ebp, r10d >> sub ebp, [r13+0] >> jz loc_100000D30 >> cmp ecx, ebp >> jz loc_100000D30 >> cmp edi, ebp >> jz loc_100000D30 >> add r13, 4 >> inc rdi >> dec ecx >> mov ebx, edi >> add ebx, esi >> cmp ebx, r15d >> jl short loc_100000DB0 >> inc r12 >> add 
r9, 4 >> inc rsi >> cmp r12d, r15d >> jl short loc_100000D99 >> >> loc_100000DF0: ; CODE XREF: _main+107 j >> mov ebx, 1 >> jmp loc_100000D30 >> ; --------------------------------------------------------------------------- >> >> loc_100000DFA: ; CODE XREF: _main+5E j >> mov ecx, [rax+r8*4] >> lea r9d, [rcx+1] >> mov [rax+r8*4], r9d >> cmp ecx, r8d >> jge loc_100000F0E >> lea r12, [rax+4] >> xor r14d, r14d >> db 2Eh >> nop word ptr [rax+rax+00000000h] >> >> loc_100000E20: ; CODE XREF: _main+216 j >> test r15d, r15d >> setle cl >> cmp r15d, 2 >> jl short loc_100000E90 >> test cl, cl >> mov r13d, 0 >> mov r11, r12 >> mov r10d, 1 >> jnz short loc_100000E90 >> >> loc_100000E3F: ; CODE XREF: _main+1F0 j >> mov edi, [rax+r13*4] >> mov edx, 0FFFFFFFFh >> mov ecx, 1 >> mov rsi, r11 >> >> loc_100000E50: ; CODE XREF: _main+1E1 j >> xor ebx, ebx >> mov ebp, edi >> sub ebp, [rsi] >> jz short loc_100000E95 >> cmp edx, ebp >> jz short loc_100000E95 >> cmp ecx, ebp >> jz short loc_100000E95 >> add rsi, 4 >> inc rcx >> dec edx >> mov ebx, ecx >> add ebx, r13d >> cmp ebx, r15d >> jl short loc_100000E50 >> inc r10 >> add r11, 4 >> inc r13 >> cmp r10d, r15d >> jl short loc_100000E3F >> db 66h, 66h, 66h, 66h, 2Eh >> nop word ptr [rax+rax+00000000h] >> >> loc_100000E90: ; CODE XREF: _main+19A j >> ; _main+1AD j >> mov ebx, 1 >> >> loc_100000E95: ; CODE XREF: _main+1C6 j >> ; _main+1CA j ... 
>> add r14d, ebx >> cmp r9d, r8d >> lea ecx, [r9+1] >> mov [rax+r8*4], ecx >> mov r9d, ecx >> jl loc_100000E20 >> jmp short loc_100000F0E >> ; --------------------------------------------------------------------------- >> >> loc_100000EAE: ; CODE XREF: _main+6B j >> add r15d, 0FFFFFFFEh >> movsxd rcx, r15d >> lea rcx, [rax+rcx*4] >> xor r14d, r14d >> jmp short loc_100000EC6 >> ; --------------------------------------------------------------------------- >> align 20h >> >> loc_100000EC0: ; CODE XREF: _main+247 j >> ; _main+27C j >> inc r14d >> mov r11d, ebp >> >> loc_100000EC6: ; CODE XREF: _main+22C j >> lea ebp, [r11+1] >> mov [rax+r8*4], ebp >> cmp r11d, r8d >> mov rsi, rcx >> mov edi, r15d >> jl short loc_100000EC0 >> nop dword ptr [rax+00000000h] >> >> loc_100000EE0: ; CODE XREF: _main+26A j >> mov ebp, [rsi] >> lea ebx, [rbp+1] >> mov [rsi], ebx >> cmp ebp, edx >> jl short loc_100000EFE >> mov dword ptr [rsi], 0 >> add rsi, 0FFFFFFFFFFFFFFFCh >> test edi, edi >> lea edi, [rdi-1] >> jg short loc_100000EE0 >> jmp short loc_100000F0E >> ; --------------------------------------------------------------------------- >> >> loc_100000EFE: ; CODE XREF: _main+259 j >> test edi, edi >> js short loc_100000F0E >> mov dword ptr [rax+r8*4], 0 >> xor ebp, ebp >> jmp short loc_100000EC0 >> ; --------------------------------------------------------------------------- >> >> loc_100000F0E: ; CODE XREF: _main+DC j >> ; _main+E3 j ... 
>> mov rdi, rax ; void * >> call _free >> lea rdi, aDSolutions ; "%d solutions\n" >> xor ebx, ebx >> xor eax, eax >> mov esi, r14d >> call _printf >> >> loc_100000F29: ; CODE XREF: _main+16 j >> mov eax, ebx >> add rsp, 18h >> pop rbx >> pop r12 >> pop r13 >> pop r14 >> pop r15 >> pop rbp >> retn >> _main endp >> ``` >> >> clang 3.6's result: >> >> ``` >> public _main >> _main proc near >> >> var_60 = qword ptr -60h >> var_58 = qword ptr -58h >> var_50 = qword ptr -50h >> var_48 = qword ptr -48h >> var_40 = qword ptr -40h >> var_38 = qword ptr -38h >> >> push rbp >> push r15 >> push r14 >> push r13 >> push r12 >> push rbx >> sub rsp, 38h >> mov ebx, 0FFFFFFFFh >> cmp edi, 2 >> jnz loc_100000F23 >> mov rbx, offset __mh_execute_header >> mov rdi, [rsi+8] ; char * >> xor r13d, r13d >> xor esi, esi ; char ** >> mov edx, 0Ah ; int >> call _strtol >> mov r14, rax >> shl rax, 20h >> mov [rsp+68h+var_38], rax >> lea rsi, [rax+rbx] >> sar rsi, 20h ; size_t >> mov edi, 4 ; size_t >> call _calloc >> lea r11d, [r14-1] >> movsxd r12, r11d >> mov [rsp+68h+var_40], r12 >> movsxd rcx, r14d >> mov [rsp+68h+var_50], rcx >> add ecx, 0FFFFFFFEh >> js loc_100000E1A >> mov ecx, r14d >> add ecx, 0FFFFFFFEh >> movsxd rcx, ecx >> inc rcx >> mov [rsp+68h+var_58], rcx >> mov rcx, rax >> add rcx, 4 >> mov [rsp+68h+var_60], rcx >> xor ebp, ebp >> jmp short loc_100000D17 >> ; --------------------------------------------------------------------------- >> align 10h >> >> loc_100000D10: ; CODE XREF: _main+15B j >> ; _main+163 j ... 
>> mov rbp, [rsp+68h+var_48] >> add ebp, edi >> >> loc_100000D17: ; CODE XREF: _main+93 j >> cmp r13d, r11d >> lea edx, [r13+1] >> mov [rax+r12*4], edx >> mov rcx, [rsp+68h+var_58] >> mov r13d, edx >> jl short loc_100000D6B >> nop dword ptr [rax+00h] >> >> loc_100000D30: ; CODE XREF: _main+DE j >> mov edx, [rax+rcx*4-4] >> lea esi, [rdx+1] >> mov [rax+rcx*4-4], esi >> cmp edx, r11d >> jl short loc_100000D60 >> mov dword ptr [rax+rcx*4-4], 0 >> dec rcx >> test rcx, rcx >> jg short loc_100000D30 >> jmp loc_100000F09 >> ; --------------------------------------------------------------------------- >> align 20h >> >> loc_100000D60: ; CODE XREF: _main+CE j >> mov dword ptr [rax+r12*4], 0 >> xor r13d, r13d >> >> loc_100000D6B: ; CODE XREF: _main+BA j >> mov [rsp+68h+var_48], rbp >> test r14d, r14d >> setle cl >> mov rdx, offset __mh_execute_header >> lea rdx, [rdx+1] >> cmp [rsp+68h+var_38], rdx >> jl loc_100000E10 >> test cl, cl >> mov edx, 0 >> mov r10, [rsp+68h+var_60] >> mov r9d, 1 >> jnz short loc_100000E10 >> >> loc_100000DA3: ; CODE XREF: _main+195 j >> mov esi, [rax+rdx*4] >> mov r15d, 0FFFFFFFFh >> mov r8d, 1 >> mov rcx, r10 >> db 66h, 66h, 2Eh >> nop dword ptr [rax+rax+00000000h] >> >> loc_100000DC0: ; CODE XREF: _main+184 j >> mov ebx, [rcx] >> mov ebp, esi >> sub ebp, ebx >> xor edi, edi >> cmp r8d, ebp >> jz loc_100000D10 >> cmp esi, ebx >> jz loc_100000D10 >> cmp r15d, ebp >> jz loc_100000D10 >> add rcx, 4 >> inc r8 >> dec r15d >> mov edi, r8d >> add edi, edx >> cmp edi, r14d >> jl short loc_100000DC0 >> inc r9 >> add r10, 4 >> inc rdx >> cmp r9, [rsp+68h+var_50] >> jl short loc_100000DA3 >> nop word ptr [rax+rax+00000000h] >> >> loc_100000E10: ; CODE XREF: _main+119 j >> ; _main+131 j >> mov edi, 1 >> jmp loc_100000D10 >> ; --------------------------------------------------------------------------- >> >> loc_100000E1A: ; CODE XREF: _main+6E j >> test r14d, r14d >> jle loc_100000F00 >> mov dword ptr [rax+r12*4], 1 >> xor ebp, ebp >> cmp r14d, 2 >> jl 
loc_100000F09 >> mov rcx, rax >> add rcx, 4 >> mov [rsp+68h+var_48], rcx >> xor ebp, ebp >> mov r15d, 1 >> nop dword ptr [rax+rax+00h] >> >> loc_100000E50: ; CODE XREF: _main+288 j >> mov rbx, rbp >> mov rcx, offset __mh_execute_header >> cmp [rsp+68h+var_38], rcx >> mov edx, 0 >> mov r13, [rsp+68h+var_48] >> mov r8d, 1 >> mov r9d, 1 >> jle short loc_100000EE0 >> >> loc_100000E7A: ; CODE XREF: _main+25A j >> mov r12d, [rax+rdx*4] >> mov edi, 0FFFFFFFFh >> mov ecx, 1 >> mov rsi, r13 >> nop dword ptr [rax+rax+00h] >> >> loc_100000E90: ; CODE XREF: _main+249 j >> mov r10d, [rsi] >> mov ebp, r12d >> sub ebp, r10d >> xor r9d, r9d >> cmp ecx, ebp >> jz short loc_100000EE0 >> cmp r12d, r10d >> jz short loc_100000EE0 >> cmp edi, ebp >> jz short loc_100000EE0 >> add rsi, 4 >> inc rcx >> dec edi >> mov ebp, ecx >> add ebp, edx >> cmp ebp, r14d >> jl short loc_100000E90 >> inc r8 >> add r13, 4 >> inc rdx >> cmp r8, [rsp+68h+var_50] >> jl short loc_100000E7A >> mov r9d, 1 >> db 66h, 66h, 66h, 66h, 2Eh >> nop word ptr [rax+rax+00000000h] >> >> loc_100000EE0: ; CODE XREF: _main+208 j >> ; _main+22E j ... >> mov rbp, rbx >> add ebp, r9d >> cmp r15d, r11d >> lea ecx, [r15+1] >> mov rdx, [rsp+68h+var_40] >> mov [rax+rdx*4], ecx >> mov r15d, ecx >> jl loc_100000E50 >> jmp short loc_100000F09 >> ; --------------------------------------------------------------------------- >> >> loc_100000F00: ; CODE XREF: _main+1AD j >> xor ebp, ebp >> test r11d, r11d >> cmovns ebp, r11d >> >> loc_100000F09: ; CODE XREF: _main+E0 j >> ; _main+1C1 j ... 
>> mov rdi, rax ; void * >> call _free >> lea rdi, aDSolutions ; "%d solutions\n" >> xor ebx, ebx >> xor eax, eax >> mov esi, ebp >> call _printf >> >> loc_100000F23: ; CODE XREF: _main+16 j >> mov eax, ebx >> add rsp, 38h >> pop rbx >> pop r12 >> pop r13 >> pop r14 >> pop r15 >> pop rbp >> retn >> _main endp >> ``` >> >> gcc-4.9.2's result: >> ``` >> >> _main proc near >> >> var_48 = qword ptr -48h >> var_40 = dword ptr -40h >> var_3C = dword ptr -3Ch >> >> cmp edi, 2 >> jz short loc_100000D69 >> or eax, 0FFFFFFFFh >> retn >> ; --------------------------------------------------------------------------- >> >> loc_100000D69: ; CODE XREF: _main+3 j >> push r15 >> mov edx, 0Ah ; int >> push r14 >> push r13 >> push r12 >> push rbp >> push rbx >> sub rsp, 18h >> mov rdi, [rsi+8] ; char * >> xor esi, esi ; char ** >> call _strtol >> mov edi, 4 ; size_t >> lea esi, [rax+1] >> mov r14, rax >> mov ebx, eax >> lea r15d, [r14-2] >> movsxd rsi, esi ; size_t >> call _calloc >> mov [rsp+48h+var_3C], 0 >> mov rdi, rax ; void * >> lea eax, [r14-1] >> cdqe >> lea r13, [rdi+rax*4] >> movsxd rax, r15d >> mov ebp, [r13+0] >> shl rax, 2 >> lea r12, [rdi+rax] >> lea rax, [rdi+rax-4] >> mov [rsp+48h+var_48], rax >> mov eax, r14d >> lea r14d, [r14+1] >> nop word ptr [rax+rax+00h] >> nop word ptr [rax+rax+00h] >> >> loc_100000DE0: ; CODE XREF: _main+12B j >> ; _main+155 j ... 
>> add ebp, 1 >> cmp ebx, ebp >> mov [r13+0], ebp >> jg short loc_100000E62 >> test r15d, r15d >> js short loc_100000E33 >> mov ecx, [r12] >> lea edx, [rcx+1] >> cmp ebx, edx >> mov [r12], edx >> jg short loc_100000E58 >> mov r8, r12 >> mov rcx, [rsp+48h+var_48] >> mov esi, r15d >> jmp short loc_100000E24 >> ; --------------------------------------------------------------------------- >> align 10h >> >> loc_100000E10: ; CODE XREF: _main+D1 j >> mov edx, [rcx] >> sub r8, 4 >> sub rcx, 4 >> add edx, 1 >> mov [rcx+4], edx >> cmp ebx, edx >> jg short loc_100000E58 >> >> loc_100000E24: ; CODE XREF: _main+A9 j >> sub esi, 1 >> mov dword ptr [r8], 0 >> cmp esi, 0FFFFFFFFh >> jnz short loc_100000E10 >> >> loc_100000E33: ; CODE XREF: _main+8E j >> call _free >> mov esi, [rsp+48h+var_3C] >> add rsp, 18h >> xor eax, eax >> pop rbx >> lea rdi, aDSolutions ; "%d solutions\n" >> pop rbp >> pop r12 >> pop r13 >> pop r14 >> pop r15 >> jmp _printf >> ; --------------------------------------------------------------------------- >> >> loc_100000E58: ; CODE XREF: _main+9D j >> ; _main+C2 j >> mov dword ptr [r13+0], 0 >> xor ebp, ebp >> >> loc_100000E62: ; CODE XREF: _main+89 j >> test ebx, ebx >> jle loc_100000EE6 >> lea r11, [rdi+8] >> xor r10d, r10d >> >> loc_100000E71: ; CODE XREF: _main+184 j >> add r10d, 1 >> cmp r10d, eax >> jz short loc_100000EE6 >> mov r8d, [r11-8] >> mov edx, r8d >> sub edx, [r11-4] >> add edx, 1 >> cmp edx, 2 >> jbe loc_100000DE0 >> mov r9d, r14d >> mov rcx, r11 >> mov edx, 1 >> mov [rsp+48h+var_40], r10d >> sub r9d, r10d >> jmp short loc_100000ED3 >> ; --------------------------------------------------------------------------- >> align 10h >> >> loc_100000EB0: ; CODE XREF: _main+179 j >> mov esi, r8d >> sub esi, [rcx] >> jz loc_100000DE0 >> mov r10d, esi >> add rcx, 4 >> add r10d, edx >> jz loc_100000DE0 >> cmp esi, edx >> jz loc_100000DE0 >> >> loc_100000ED3: ; CODE XREF: _main+144 j >> add edx, 1 >> cmp edx, r9d >> jnz short loc_100000EB0 >> mov r10d, 
[rsp+48h+var_40] >> add r11, 4 >> jmp short loc_100000E71 >> ; --------------------------------------------------------------------------- >> >> loc_100000EE6: ; CODE XREF: _main+104 j >> ; _main+118 j >> add [rsp+48h+var_3C], 1 >> jmp loc_100000DE0 >> _main endp >> ``` >> >> MSVC 10.0's result: >> >> ``` >> >> _main proc near ; CODE XREF: ___tmainCRTStartup+106 p >> >> var_80 = dword ptr -80h >> var_7C = dword ptr -7Ch >> var_78 = dword ptr -78h >> var_74 = dword ptr -74h >> var_70 = dword ptr -70h >> var_6C = dword ptr -6Ch >> var_68 = dword ptr -68h >> var_64 = dword ptr -64h >> var_60 = dword ptr -60h >> var_5C = dword ptr -5Ch >> argc = dword ptr 8 >> argv = dword ptr 0Ch >> envp = dword ptr 10h >> >> push ebp >> mov ebp, esp >> and esp, 0FFFFFF80h >> push esi >> push edi >> push ebx >> sub esp, 74h >> push 3 >> call sub_4080F0 >> add esp, 4 >> stmxcsr [esp+80h+var_80] >> or [esp+80h+var_80], 8000h >> ldmxcsr [esp+80h+var_80] >> cmp [ebp+argc], 2 >> jz short loc_40103A >> mov eax, 0FFFFFFFFh >> add esp, 74h >> pop ebx >> pop edi >> pop esi >> mov esp, ebp >> pop ebp >> retn >> ; --------------------------------------------------------------------------- >> >> loc_40103A: ; CODE XREF: _main+29 j >> call ds:GetTickCount >> mov esi, eax >> mov eax, [ebp+argv] >> push dword ptr [eax+4] ; char * >> call _atoi >> mov edi, eax >> lea eax, [edi+1] >> push eax ; size_t >> push 4 ; size_t >> call _calloc >> add esp, 0Ch >> mov ecx, [eax+edi*4-4] >> lea edx, [edi-1] >> mov [esp+80h+var_6C], ecx >> xor ebx, ebx >> mov [esp+80h+var_7C], ebx >> lea ecx, [eax+edi*4] >> mov [esp+80h+var_74], ecx >> lea ecx, [edi-2] >> mov [esp+80h+var_70], ecx >> mov [esp+80h+var_60], edx >> mov [esp+80h+var_80], esi >> mov ecx, [esp+80h+var_6C] >> >> loc_401087: ; CODE XREF: _main+142 j >> ; _main+193 j >> mov edx, [esp+80h+var_60] >> inc ecx >> mov [eax+edi*4-4], ecx >> cmp edi, [eax+edx*4] >> jg short loc_4010DC >> mov esi, [esp+80h+var_70] >> test esi, esi >> js short loc_4010CE >> xor 
edx, edx >> mov [esp+80h+var_78], eax >> xor ebx, ebx >> mov eax, [esp+80h+var_74] >> >> loc_4010A9: ; CODE XREF: _main+C8 j >> mov ecx, [eax+ebx*4-8] >> inc ecx >> cmp ecx, edi >> jl loc_40117A >> inc edx >> lea esi, [ebx+edi-3] >> mov dword ptr [eax+ebx*4-8], 0 >> dec ebx >> cmp edx, [esp+80h+var_60] >> jb short loc_4010A9 >> mov eax, [esp+80h+var_78] >> >> loc_4010CE: ; CODE XREF: _main+9B j >> ; _main+186 j >> test esi, esi >> jl short loc_401147 >> mov dword ptr [eax+edi*4-4], 0 >> xor ecx, ecx >> >> loc_4010DC: ; CODE XREF: _main+93 j >> test edi, edi >> jle short loc_40113E >> mov [esp+80h+var_6C], ecx >> xor edx, edx >> mov [esp+80h+var_5C], edi >> >> loc_4010EA: ; CODE XREF: _main+132 j >> lea ecx, [edx+1] >> mov ebx, ecx >> mov esi, ebx >> cmp ecx, [esp+80h+var_5C] >> jge short loc_401130 >> mov edx, [eax+edx*4] >> mov edi, 1 >> mov [esp+80h+var_64], esi >> mov [esp+80h+var_68], ecx >> >> loc_401107: ; CODE XREF: _main+122 j >> mov esi, [eax+ebx*4] >> cmp edx, esi >> jz short loc_40118B >> sub esi, edx >> mov ecx, esi >> neg ecx >> cmp edi, ecx >> jz short loc_40118B >> cmp esi, edi >> jz short loc_40118B >> inc ebx >> inc edi >> cmp ebx, [esp+80h+var_5C] >> jl short loc_401107 >> mov ecx, [esp+80h+var_68] >> mov esi, [esp+80h+var_64] >> cmp ecx, [esp+80h+var_5C] >> >> loc_401130: ; CODE XREF: _main+F5 j >> mov edx, esi >> jl short loc_4010EA >> xchg ax, ax >> mov ecx, [esp+80h+var_6C] >> mov edi, [esp+80h+var_5C] >> >> loc_40113E: ; CODE XREF: _main+DE j >> inc [esp+80h+var_7C] >> jmp loc_401087 >> ; --------------------------------------------------------------------------- >> >> loc_401147: ; CODE XREF: _main+D0 j >> mov ebx, [esp+80h+var_7C] >> mov esi, [esp+80h+var_80] >> push eax ; void * >> call _free >> add esp, 4 >> call ds:GetTickCount >> sub eax, esi >> push eax >> push ebx >> push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n" >> call _printf >> xor eax, eax >> add esp, 80h >> pop ebx >> pop edi >> pop esi >> mov esp, ebp >> pop ebp >> 
retn >> ; --------------------------------------------------------------------------- >> >> loc_40117A: ; CODE XREF: _main+B0 j >> mov edx, [esp+80h+var_74] >> mov eax, [esp+80h+var_78] >> mov [edx+ebx*4-8], ecx >> jmp loc_4010CE >> ; --------------------------------------------------------------------------- >> >> loc_40118B: ; CODE XREF: _main+10C j >> ; _main+116 j ... >> mov ecx, [esp+80h+var_6C] >> mov edi, [esp+80h+var_5C] >> jmp loc_401087 >> _main endp >> ``` >> _______________________________________________ >> LLVM Developers mailing list >> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Seemingly Similar Threads
- [LLVMdev] trunk's optimizer generates slower code than 3.5
- [LLVMdev] trunk's optimizer generates slower code than 3.5
- [LLVMdev] bug in X86 disasm code?
- Tail call optimization is getting affected due to local function related optimization with IPRA
- Tail call optimization is getting affected due to local function related optimization with IPRA