I submitted the problem report to clang's bugzilla but no one seems to
care so I have to send it to the mailing list.
clang 3.7 svn (trunk 229055 as of the time I reported this problem)
generates slower code than 3.5 (Apple LLVM version 6.0
(clang-600.0.56) (based on LLVM 3.5svn)) for the following code.
It is an "8 queens puzzle" solver written as an educational example. When
compiled by either clang 3.5 or 3.7 it gives the correct answer, but
clang 3.5 generates code which runs 20% faster than 3.6/3.7.
##########################################
# clang 3.5 which comes with Xcode 6.1.1
##########################################
$ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
-fno-exceptions -o 8 8.c
$ time ./8 9 # 9 queens
352 solutions
./8 9 1.63s user 0.00s system 99% cpu 1.632 total
$ time ./8 10 # 10 queens
724 solutions
./8 10 45.11s user 0.01s system 99% cpu 45.121 total
##########################################
# clang 3.7 svn trunk
##########################################
$ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
-fno-exceptions -o 8 8.c
$ time ./8 9 # 9 queens
352 solutions
./8 9 2.07s user 0.00s system 99% cpu 2.078 total
$ time ./8 10 # 10 queens
724 solutions
./8 10 56.63s user 0.02s system 99% cpu 56.650 total
The source code is below. I have also attached the executable files, as well
as the assembly listings produced by IDA for the clang 3.5 and 3.6 builds.
The performance is even worse when compiling as 32-bit code while
gcc-4.9.2 is not affected.
########## clang-3.5
$ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
-fno-exceptions -o 8 8.c
$ time ./8 9
352 solutions
./8 9 1.95s user 0.00s system 99% cpu 1.950 total
########## clang-3.7
$ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
-fno-exceptions -o 8 8.c
$ time ./8 9
352 solutions
./8 9 2.48s user 0.00s system 99% cpu 2.480 total
######### gcc-4.9.2
$ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector
-fno-exceptions -o 8 8.c
$ time ./8 9
352 solutions
./8 9 1.44s user 0.00s system 99% cpu 1.442 total
```
#include <stdio.h>
#include <stdlib.h>
/*
 * validate - check one candidate placement for conflicting pairs.
 *
 * a: array of at least d ints, one value per index.
 * d: number of leading entries of a to compare.
 *
 * Returns 0 as soon as some pair i < j < d has a[i] == a[j] or has
 * a[i] - a[j] equal to +(j - i) or -(j - i); returns 1 when no pair does.
 * In queens terms (the surrounding post calls this an N-queens solver)
 * that is "same line" or "same diagonal".
 */
static inline int validate(int* a, int d)
{
int i, j, x;
for (i = 0; i < d; ++i)
{
/* x starts at 1 when j == i+1 and is incremented together with j, so x == j - i. */
for (j = i+1, x = 1; j < d; ++j, ++x)
{
/* This local d shadows the parameter only inside this block; the loop
   conditions above are outside it and still test the parameter d. */
const int d = a[i] - a[j];
if (d == 0 || d == -x || d == x) return 0;
}
}
return 1;
}
/*
 * solve - count the placements of size d that validate() accepts.
 *
 * The array a is stepped like an odometer whose digits run 0..d-1: the
 * last digit a[d-1] is incremented each iteration and overflow carries
 * into lower indices. Enumeration stops when the carry runs past index 0.
 *
 * d: board size. Returns the number of accepted placements.
 *
 * NOTE(review): the increment happens before the first validate() call, so
 * the initial all-zero array is never checked (for d == 1 the loop exits
 * with r == 0).
 * NOTE(review): for d <= 0, p is negative and a[p]++ below indexes before
 * the start of the allocation; the calloc result is also used unchecked.
 */
static inline int solve(int d)
{
int r = 0;
/* d+1 zero-initialised ints; only indices 0..d-1 are accessed below. */
int* a = (int*) calloc(sizeof(int), d+1);
int p = d - 1;
for (;;)
{
a[p]++;
if (a[p] > d-1)
{
/* Last digit overflowed: propagate the carry towards index 0. */
int bp = p - 1;
while (bp >= 0)
{
a[bp]++;
if (a[bp] <= d-1) break;
a[bp] = 0;
--bp;
}
/* Carry ran off the front: every combination has been visited. */
if (bp < 0)
break;
a[p] = 0;
}
if (validate(a, d))
{
++r;
}
}
free(a);
return r;
}
/*
 * main - parse the board size from argv[1] and print the solution count.
 *
 * Returns -1 unless exactly one argument is given. Otherwise argv[1] is
 * parsed as base-10 with strtol (no end-pointer or range check), narrowed
 * to int, handed to solve(), and the result printed as "%d solutions\n".
 * There is no explicit return on the success path; under C99 and later,
 * reaching the end of main is equivalent to returning 0.
 */
int main(int argc, char** argv)
{
if (argc != 2) return -1;
int r = solve((int) strtol(argv[1], NULL, 10));
printf("%d solutions\n", r);
}
```
clang 3.5's result:
```
; _main as disassembled by IDA (Intel operand order). The only calls in the
; body are _strtol, _calloc, _free and _printf, so the solve/validate logic of
; the C source is inlined into this one procedure.
; Register roles visible below:
;   r15  = strtol result (d)
;   rax  = calloc result (a); no call intervenes until _free
;   edx  = d-1, r8 = sign-extended d-1 (index p of the last element)
;   r11d = cached copy of a[p] in the main loop
;   r14d = running solution count, passed to printf in esi
;   ebx  = -1 for the argc != 2 exit; inside the loops, the 0/1 check result
;   var_34 = d-2, var_40 = &a[d-2], var_48 = &a[1]
public _main
_main proc near
var_48 = qword ptr -48h
var_40 = qword ptr -40h
var_34 = dword ptr -34h
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 18h
; ebx = -1 is the return value used when argc != 2.
mov ebx, 0FFFFFFFFh
cmp edi, 2
jnz loc_100000F29
mov rdi, [rsi+8] ; char *
xor r14d, r14d
xor esi, esi ; char **
mov edx, 0Ah ; int
call _strtol
mov r15, rax
; rsi = ((d << 32) + imm64) >> 32 (arithmetic). IDA prints the immediate as
; the symbol located at that address; presumably it is 100000000h (1 << 32),
; which makes rsi the sign-extended int value d+1 - TODO confirm.
shl rax, 20h
mov rsi, offset __mh_execute_header
add rsi, rax
sar rsi, 20h ; size_t
mov edi, 4 ; size_t
call _calloc
lea edx, [r15-1]
movsxd r8, edx
; d-2 negative: there is no lower index to carry into; use the loop copy at
; loc_100000DFA.
mov ecx, r15d
add ecx, 0FFFFFFFEh
js loc_100000DFA
; mov does not change flags, so jle still tests d from the test above.
test r15d, r15d
mov r11d, [rax+r8*4]
jle loc_100000EAE
; Set up the stack slots: var_34 = d-2, var_40 = &a[d-2], var_48 = &a[1].
mov ecx, r15d
add ecx, 0FFFFFFFEh
mov [rsp+48h+var_34], ecx
movsxd rcx, ecx
lea rcx, [rax+rcx*4]
mov [rsp+48h+var_40], rcx
lea rcx, [rax+4]
mov [rsp+48h+var_48], rcx
xor r14d, r14d
jmp short loc_100000D33
; ---------------------------------------------------------------------------
align 10h
; Fold the result of the last check (ebx is 0 or 1) into the count.
loc_100000D30: ; CODE XREF: _main+129 j
; _main+131 j ...
add r14d, ebx
; Main loop head: store a[p] = old + 1. lea/mov leave the flags of the cmp
; (old value vs d-1) intact, so jl is taken when no carry is needed.
loc_100000D33: ; CODE XREF: _main+92 j
cmp r11d, edx
lea edi, [r11+1]
mov [rax+r8*4], edi
mov rcx, [rsp+48h+var_40]
mov esi, [rsp+48h+var_34]
mov r11d, edi
jl short loc_100000D84
nop dword ptr [rax+00h]
; Carry loop: rcx walks down from &a[d-2] while esi counts the index down.
; Each element is incremented; if it was already d-1 it is reset to 0 and the
; carry continues, otherwise control leaves via loc_100000D71.
loc_100000D50: ; CODE XREF: _main+DA j
mov edi, [rcx]
lea ebp, [rdi+1]
mov [rcx], ebp
cmp edi, edx
jl short loc_100000D71
mov dword ptr [rcx], 0
add rcx, 0FFFFFFFFFFFFFFFCh
test esi, esi
lea esi, [rsi-1]
jg short loc_100000D50
jmp loc_100000F0E
; ---------------------------------------------------------------------------
; Carry absorbed: unless the index went negative, reset a[p] and its cache.
loc_100000D71: ; CODE XREF: _main+C9 j
test esi, esi
js loc_100000F0E
mov dword ptr [rax+r8*4], 0
xor r11d, r11d
; Inlined pair check. rsi = i (starts at 0), r9 = &a[i+1], r12 = i+1.
; With d <= 1 there are no pairs and the candidate counts as valid.
loc_100000D84: ; CODE XREF: _main+BA j
cmp r15d, 1
mov esi, 0
mov r9, [rsp+48h+var_48]
mov r12d, 1
jle short loc_100000DF0
; Outer iteration: r10d = a[i], ecx = -x, edi = x (x starts at 1), r13 = &a[j].
loc_100000D99: ; CODE XREF: _main+15E j
mov r10d, [rax+rsi*4]
mov ecx, 0FFFFFFFFh
mov edi, 1
mov r13, r9
nop word ptr [rax+rax+00h]
; Inner iteration: ebp = a[i] - a[j]. A difference of 0, -x or x leaves
; ebx = 0 and rejects the candidate. Otherwise ebx = x + i is the next j,
; compared against d.
loc_100000DB0: ; CODE XREF: _main+14F j
xor ebx, ebx
mov ebp, r10d
sub ebp, [r13+0]
jz loc_100000D30
cmp ecx, ebp
jz loc_100000D30
cmp edi, ebp
jz loc_100000D30
add r13, 4
inc rdi
dec ecx
mov ebx, edi
add ebx, esi
cmp ebx, r15d
jl short loc_100000DB0
inc r12
add r9, 4
inc rsi
cmp r12d, r15d
jl short loc_100000D99
; No conflicting pair found: ebx = 1.
loc_100000DF0: ; CODE XREF: _main+107 j
mov ebx, 1
jmp loc_100000D30
; ---------------------------------------------------------------------------
; Path taken when d-2 is negative: a second copy of the increment-and-check
; loop that contains no carry loop. r9d caches a[p] here.
loc_100000DFA: ; CODE XREF: _main+5E j
mov ecx, [rax+r8*4]
lea r9d, [rcx+1]
mov [rax+r8*4], r9d
cmp ecx, r8d
jge loc_100000F0E
lea r12, [rax+4]
xor r14d, r14d
db 2Eh
nop word ptr [rax+rax+00000000h]
loc_100000E20: ; CODE XREF: _main+216 j
test r15d, r15d
setle cl
cmp r15d, 2
jl short loc_100000E90
test cl, cl
mov r13d, 0
mov r11, r12
mov r10d, 1
jnz short loc_100000E90
; Same pair check as loc_100000D99/loc_100000DB0 with different registers:
; edi = a[i], edx = -x, ecx = x, rsi = &a[j], r13 = i.
loc_100000E3F: ; CODE XREF: _main+1F0 j
mov edi, [rax+r13*4]
mov edx, 0FFFFFFFFh
mov ecx, 1
mov rsi, r11
loc_100000E50: ; CODE XREF: _main+1E1 j
xor ebx, ebx
mov ebp, edi
sub ebp, [rsi]
jz short loc_100000E95
cmp edx, ebp
jz short loc_100000E95
cmp ecx, ebp
jz short loc_100000E95
add rsi, 4
inc rcx
dec edx
mov ebx, ecx
add ebx, r13d
cmp ebx, r15d
jl short loc_100000E50
inc r10
add r11, 4
inc r13
cmp r10d, r15d
jl short loc_100000E3F
db 66h, 66h, 66h, 66h, 2Eh
nop word ptr [rax+rax+00000000h]
loc_100000E90: ; CODE XREF: _main+19A j
; _main+1AD j
mov ebx, 1
; Count the result, then advance a[p]; loop while the old value is below r8d.
loc_100000E95: ; CODE XREF: _main+1C6 j
; _main+1CA j ...
add r14d, ebx
cmp r9d, r8d
lea ecx, [r9+1]
mov [rax+r8*4], ecx
mov r9d, ecx
jl loc_100000E20
jmp short loc_100000F0E
; ---------------------------------------------------------------------------
; Path taken from the jle above (d <= 0 as a 32-bit signed value): a third
; loop copy that keeps the carry loop but counts every iteration (inc r14d)
; without any pair check. r15d becomes d-2 and rcx = &a[d-2].
loc_100000EAE: ; CODE XREF: _main+6B j
add r15d, 0FFFFFFFEh
movsxd rcx, r15d
lea rcx, [rax+rcx*4]
xor r14d, r14d
jmp short loc_100000EC6
; ---------------------------------------------------------------------------
align 20h
loc_100000EC0: ; CODE XREF: _main+247 j
; _main+27C j
inc r14d
mov r11d, ebp
loc_100000EC6: ; CODE XREF: _main+22C j
lea ebp, [r11+1]
mov [rax+r8*4], ebp
cmp r11d, r8d
mov rsi, rcx
mov edi, r15d
jl short loc_100000EC0
nop dword ptr [rax+00000000h]
loc_100000EE0: ; CODE XREF: _main+26A j
mov ebp, [rsi]
lea ebx, [rbp+1]
mov [rsi], ebx
cmp ebp, edx
jl short loc_100000EFE
mov dword ptr [rsi], 0
add rsi, 0FFFFFFFFFFFFFFFCh
test edi, edi
lea edi, [rdi-1]
jg short loc_100000EE0
jmp short loc_100000F0E
; ---------------------------------------------------------------------------
loc_100000EFE: ; CODE XREF: _main+259 j
test edi, edi
js short loc_100000F0E
mov dword ptr [rax+r8*4], 0
xor ebp, ebp
jmp short loc_100000EC0
; ---------------------------------------------------------------------------
; Common exit: free(a), then printf(format, r14d). eax is zeroed before the
; variadic call and ebx = 0 becomes the return value.
loc_100000F0E: ; CODE XREF: _main+DC j
; _main+E3 j ...
mov rdi, rax ; void *
call _free
lea rdi, aDSolutions ; "%d solutions\n"
xor ebx, ebx
xor eax, eax
mov esi, r14d
call _printf
; Epilogue, also reached directly with ebx = -1 when argc != 2.
loc_100000F29: ; CODE XREF: _main+16 j
mov eax, ebx
add rsp, 18h
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
retn
_main endp
```
clang 3.6's result:
```
; _main as disassembled by IDA (Intel operand order). Only _strtol, _calloc,
; _free and _printf are called, so the C solve/validate logic is inlined.
; Register and stack-slot roles visible below (main loop):
;   r14  = strtol result (d)
;   rax  = calloc result (a); no call intervenes until _free
;   r11d = d-1, r12 = sign-extended d-1 (index p), also stored in var_40
;   r13d = cached copy of a[p]
;   ebp  = solution count; written to var_48 before each pair check and
;          reloaded from it afterwards, because ebp is reused as the
;          difference temporary at loc_100000DC0
;   var_38 = d << 32, var_50 = sign-extended d, var_58 = (d-2)+1,
;   var_60 = &a[1]
; NOTE(review): the var_48 store/reload and the compare against the memory
; operand var_50 both sit on the per-candidate path; possibly relevant to the
; slowdown reported in the post, but nothing in this listing measures it.
public _main
_main proc near
var_60 = qword ptr -60h
var_58 = qword ptr -58h
var_50 = qword ptr -50h
var_48 = qword ptr -48h
var_40 = qword ptr -40h
var_38 = qword ptr -38h
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 38h
; ebx = -1 is the return value used when argc != 2.
mov ebx, 0FFFFFFFFh
cmp edi, 2
jnz loc_100000F23
; IDA prints a 64-bit immediate as the symbol located at that address;
; presumably the value is 100000000h (1 << 32) - TODO confirm.
mov rbx, offset __mh_execute_header
mov rdi, [rsi+8] ; char *
xor r13d, r13d
xor esi, esi ; char **
mov edx, 0Ah ; int
call _strtol
mov r14, rax
; var_38 = d << 32; rsi = (var_38 + rbx) >> 32 (arithmetic), i.e. the
; sign-extended int d+1 if rbx is 1 << 32 as assumed above.
shl rax, 20h
mov [rsp+68h+var_38], rax
lea rsi, [rax+rbx]
sar rsi, 20h ; size_t
mov edi, 4 ; size_t
call _calloc
lea r11d, [r14-1]
movsxd r12, r11d
mov [rsp+68h+var_40], r12
movsxd rcx, r14d
mov [rsp+68h+var_50], rcx
; d-2 negative: use the loop copy at loc_100000E1A.
add ecx, 0FFFFFFFEh
js loc_100000E1A
; var_58 = sign-extended (d-2) + 1, the start index for the carry loop, which
; addresses elements as a[rcx-1]; var_60 = &a[1].
mov ecx, r14d
add ecx, 0FFFFFFFEh
movsxd rcx, ecx
inc rcx
mov [rsp+68h+var_58], rcx
mov rcx, rax
add rcx, 4
mov [rsp+68h+var_60], rcx
xor ebp, ebp
jmp short loc_100000D17
; ---------------------------------------------------------------------------
align 10h
; Reload the count saved at loc_100000D6B and add the check result (edi is
; 0 or 1).
loc_100000D10: ; CODE XREF: _main+15B j
; _main+163 j ...
mov rbp, [rsp+68h+var_48]
add ebp, edi
; Main loop head: store a[p] = old + 1. lea/mov leave the flags of the cmp
; (old value vs d-1) intact, so jl is taken when no carry is needed.
loc_100000D17: ; CODE XREF: _main+93 j
cmp r13d, r11d
lea edx, [r13+1]
mov [rax+r12*4], edx
mov rcx, [rsp+68h+var_58]
mov r13d, edx
jl short loc_100000D6B
nop dword ptr [rax+00h]
; Carry loop, indexed: a[rcx-1] is incremented; if it was already d-1 it is
; reset to 0 and rcx steps down, exiting the program loop once rcx reaches 0.
loc_100000D30: ; CODE XREF: _main+DE j
mov edx, [rax+rcx*4-4]
lea esi, [rdx+1]
mov [rax+rcx*4-4], esi
cmp edx, r11d
jl short loc_100000D60
mov dword ptr [rax+rcx*4-4], 0
dec rcx
test rcx, rcx
jg short loc_100000D30
jmp loc_100000F09
; ---------------------------------------------------------------------------
align 20h
; Carry absorbed: reset a[p] and its cached copy.
loc_100000D60: ; CODE XREF: _main+CE j
mov dword ptr [rax+r12*4], 0
xor r13d, r13d
; Inlined pair check. The count in rbp is saved first. The comparison of
; var_38 (d << 32) against the immediate + 1 skips the check when there are
; no pairs to compare.
loc_100000D6B: ; CODE XREF: _main+BA j
mov [rsp+68h+var_48], rbp
test r14d, r14d
setle cl
mov rdx, offset __mh_execute_header
lea rdx, [rdx+1]
cmp [rsp+68h+var_38], rdx
jl loc_100000E10
test cl, cl
mov edx, 0
mov r10, [rsp+68h+var_60]
mov r9d, 1
jnz short loc_100000E10
; Outer iteration: rdx = i, esi = a[i], r15d = -x, r8 = x (x starts at 1),
; rcx = &a[j], r10 = &a[i+1], r9 = i+1.
loc_100000DA3: ; CODE XREF: _main+195 j
mov esi, [rax+rdx*4]
mov r15d, 0FFFFFFFFh
mov r8d, 1
mov rcx, r10
db 66h, 66h, 2Eh
nop dword ptr [rax+rax+00000000h]
; Inner iteration: ebx = a[j], ebp = a[i] - a[j] (this overwrites the count
; register). The three rejections are diff == x, a[i] == a[j] and diff == -x,
; each leaving edi = 0. Otherwise edi = x + i is the next j, compared against
; d; the outer bound is compared against the memory slot var_50.
loc_100000DC0: ; CODE XREF: _main+184 j
mov ebx, [rcx]
mov ebp, esi
sub ebp, ebx
xor edi, edi
cmp r8d, ebp
jz loc_100000D10
cmp esi, ebx
jz loc_100000D10
cmp r15d, ebp
jz loc_100000D10
add rcx, 4
inc r8
dec r15d
mov edi, r8d
add edi, edx
cmp edi, r14d
jl short loc_100000DC0
inc r9
add r10, 4
inc rdx
cmp r9, [rsp+68h+var_50]
jl short loc_100000DA3
nop word ptr [rax+rax+00000000h]
; No conflicting pair found: edi = 1.
loc_100000E10: ; CODE XREF: _main+119 j
; _main+131 j
mov edi, 1
jmp loc_100000D10
; ---------------------------------------------------------------------------
; Path taken when d-2 is negative. d <= 0 goes to loc_100000F00; otherwise
; a[p] is set to 1 and a second loop copy without a carry loop runs. In this
; copy var_48 holds &a[1], rbx holds the count across the check, r15d caches
; a[p] and r9d carries the 0/1 check result.
loc_100000E1A: ; CODE XREF: _main+6E j
test r14d, r14d
jle loc_100000F00
mov dword ptr [rax+r12*4], 1
xor ebp, ebp
cmp r14d, 2
jl loc_100000F09
mov rcx, rax
add rcx, 4
mov [rsp+68h+var_48], rcx
xor ebp, ebp
mov r15d, 1
nop dword ptr [rax+rax+00h]
loc_100000E50: ; CODE XREF: _main+288 j
mov rbx, rbp
mov rcx, offset __mh_execute_header
cmp [rsp+68h+var_38], rcx
mov edx, 0
mov r13, [rsp+68h+var_48]
mov r8d, 1
mov r9d, 1
jle short loc_100000EE0
; Same pair check as loc_100000DA3/loc_100000DC0 with different registers:
; r12d = a[i], edi = -x, ecx = x, rsi = &a[j], rdx = i.
loc_100000E7A: ; CODE XREF: _main+25A j
mov r12d, [rax+rdx*4]
mov edi, 0FFFFFFFFh
mov ecx, 1
mov rsi, r13
nop dword ptr [rax+rax+00h]
loc_100000E90: ; CODE XREF: _main+249 j
mov r10d, [rsi]
mov ebp, r12d
sub ebp, r10d
xor r9d, r9d
cmp ecx, ebp
jz short loc_100000EE0
cmp r12d, r10d
jz short loc_100000EE0
cmp edi, ebp
jz short loc_100000EE0
add rsi, 4
inc rcx
dec edi
mov ebp, ecx
add ebp, edx
cmp ebp, r14d
jl short loc_100000E90
inc r8
add r13, 4
inc rdx
cmp r8, [rsp+68h+var_50]
jl short loc_100000E7A
mov r9d, 1
db 66h, 66h, 66h, 66h, 2Eh
nop word ptr [rax+rax+00000000h]
; Restore the count from rbx, add the check result, then advance a[p] through
; the index reloaded from var_40.
loc_100000EE0: ; CODE XREF: _main+208 j
; _main+22E j ...
mov rbp, rbx
add ebp, r9d
cmp r15d, r11d
lea ecx, [r15+1]
mov rdx, [rsp+68h+var_40]
mov [rax+rdx*4], ecx
mov r15d, ecx
jl loc_100000E50
jmp short loc_100000F09
; ---------------------------------------------------------------------------
; d <= 0: the printed value is max(d-1, 0), computed without any loop.
loc_100000F00: ; CODE XREF: _main+1AD j
xor ebp, ebp
test r11d, r11d
cmovns ebp, r11d
; Common exit: free(a), then printf(format, ebp). eax is zeroed before the
; variadic call and ebx = 0 becomes the return value.
loc_100000F09: ; CODE XREF: _main+E0 j
; _main+1C1 j ...
mov rdi, rax ; void *
call _free
lea rdi, aDSolutions ; "%d solutions\n"
xor ebx, ebx
xor eax, eax
mov esi, ebp
call _printf
; Epilogue, also reached directly with ebx = -1 when argc != 2.
loc_100000F23: ; CODE XREF: _main+16 j
mov eax, ebx
add rsp, 38h
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
retn
_main endp
```
gcc-4.9.2's result:
```
; _main as disassembled by IDA (Intel operand order). Only _strtol, _calloc,
; _free and _printf are referenced, so the C solve/validate logic is inlined.
; Register and stack-slot roles visible below:
;   ebx  = d (low 32 bits of the strtol result), eax = d, r14d = d+1
;   r15d = d-2
;   rdi  = calloc result (a); never overwritten before _free, so the exit
;          path calls _free without reloading its argument
;   r13  = &a[d-1], ebp = cached copy of a[d-1]
;   r12  = &a[d-2], var_48 = &a[d-3]
;   var_3C = solution count, kept in memory
;   var_40 = spill slot for the outer index while r10d is used as a temporary
_main proc near
var_48 = qword ptr -48h
var_40 = dword ptr -40h
var_3C = dword ptr -3Ch
; argc != 2: return -1 before any register is saved.
cmp edi, 2
jz short loc_100000D69
or eax, 0FFFFFFFFh
retn
; ---------------------------------------------------------------------------
loc_100000D69: ; CODE XREF: _main+3 j
push r15
mov edx, 0Ah ; int
push r14
push r13
push r12
push rbp
push rbx
sub rsp, 18h
mov rdi, [rsi+8] ; char *
xor esi, esi ; char **
call _strtol
; calloc(4, d+1): edi = 4, rsi = sign-extended d+1.
mov edi, 4 ; size_t
lea esi, [rax+1]
mov r14, rax
mov ebx, eax
lea r15d, [r14-2]
movsxd rsi, esi ; size_t
call _calloc
mov [rsp+48h+var_3C], 0
mov rdi, rax ; void *
lea eax, [r14-1]
cdqe
lea r13, [rdi+rax*4]
movsxd rax, r15d
mov ebp, [r13+0]
shl rax, 2
lea r12, [rdi+rax]
lea rax, [rdi+rax-4]
mov [rsp+48h+var_48], rax
mov eax, r14d
lea r14d, [r14+1]
nop word ptr [rax+rax+00h]
nop word ptr [rax+rax+00h]
; Main loop head: a[d-1] = ++ebp. If d is still greater than the new value,
; go straight to the pair check at loc_100000E62.
loc_100000DE0: ; CODE XREF: _main+12B j
; _main+155 j ...
add ebp, 1
cmp ebx, ebp
mov [r13+0], ebp
jg short loc_100000E62
; Carry: with d-2 negative there is nothing to carry into, so exit.
; Otherwise bump a[d-2] first, then continue downwards in the loop below.
test r15d, r15d
js short loc_100000E33
mov ecx, [r12]
lea edx, [rcx+1]
cmp ebx, edx
mov [r12], edx
jg short loc_100000E58
mov r8, r12
mov rcx, [rsp+48h+var_48]
mov esi, r15d
jmp short loc_100000E24
; ---------------------------------------------------------------------------
align 10h
; Carry loop body: increment the next lower element (rcx) and stop as soon as
; d exceeds the new value.
loc_100000E10: ; CODE XREF: _main+D1 j
mov edx, [rcx]
sub r8, 4
sub rcx, 4
add edx, 1
mov [rcx+4], edx
cmp ebx, edx
jg short loc_100000E58
; Reset the element that overflowed (r8); esi reaching -1 means the carry ran
; past index 0 and enumeration is finished.
loc_100000E24: ; CODE XREF: _main+A9 j
sub esi, 1
mov dword ptr [r8], 0
cmp esi, 0FFFFFFFFh
jnz short loc_100000E10
; Exit: free(a) with rdi still holding a, restore registers, then tail-jump
; to _printf with esi = count and eax = 0 (which is also why no explicit
; return value is set here).
loc_100000E33: ; CODE XREF: _main+8E j
call _free
mov esi, [rsp+48h+var_3C]
add rsp, 18h
xor eax, eax
pop rbx
lea rdi, aDSolutions ; "%d solutions\n"
pop rbp
pop r12
pop r13
pop r14
pop r15
jmp _printf
; ---------------------------------------------------------------------------
; Carry absorbed: reset a[d-1] and its cached copy.
loc_100000E58: ; CODE XREF: _main+9D j
; _main+C2 j
mov dword ptr [r13+0], 0
xor ebp, ebp
; Inlined pair check. d <= 0 counts the candidate immediately.
; r11 = &a[i+2] and r10d = i, starting at i = 0.
loc_100000E62: ; CODE XREF: _main+89 j
test ebx, ebx
jle loc_100000EE6
lea r11, [rdi+8]
xor r10d, r10d
; Outer iteration: r10d becomes i+1; equality with d means every row passed.
; The adjacent pair (x == 1) is tested with one unsigned range check:
; (a[i] - a[i+1] + 1) <= 2 covers the differences -1, 0 and +1.
loc_100000E71: ; CODE XREF: _main+184 j
add r10d, 1
cmp r10d, eax
jz short loc_100000EE6
mov r8d, [r11-8]
mov edx, r8d
sub edx, [r11-4]
add edx, 1
cmp edx, 2
jbe loc_100000DE0
; Remaining pairs: edx = x (incremented to 2 on entry at loc_100000ED3),
; r9d = (d+1) - (i+1) is the exclusive bound for x, rcx = &a[i+x].
; r10d is saved to var_40 because the inner loop reuses it.
mov r9d, r14d
mov rcx, r11
mov edx, 1
mov [rsp+48h+var_40], r10d
sub r9d, r10d
jmp short loc_100000ED3
; ---------------------------------------------------------------------------
align 10h
; Inner iteration: esi = a[i] - a[j]; reject on 0, on esi + x == 0 (diff ==
; -x) and on esi == x. Rejection jumps straight to the next candidate without
; touching the count.
loc_100000EB0: ; CODE XREF: _main+179 j
mov esi, r8d
sub esi, [rcx]
jz loc_100000DE0
mov r10d, esi
add rcx, 4
add r10d, edx
jz loc_100000DE0
cmp esi, edx
jz loc_100000DE0
loc_100000ED3: ; CODE XREF: _main+144 j
add edx, 1
cmp edx, r9d
jnz short loc_100000EB0
mov r10d, [rsp+48h+var_40]
add r11, 4
jmp short loc_100000E71
; ---------------------------------------------------------------------------
; Candidate accepted: increment the in-memory count.
loc_100000EE6: ; CODE XREF: _main+104 j
; _main+118 j
add [rsp+48h+var_3C], 1
jmp loc_100000DE0
_main endp
```
MSVC 10.0's result:
```
; 32-bit _main as disassembled by IDA (Intel operand order, cdecl stack
; arguments). Apart from the C library calls, the solver logic is inline.
; NOTE(review): this listing calls _atoi and GetTickCount and prints
; "%d solutions in %d msecs", none of which appear in the C source shown in
; the post, so it was presumably built from a timing variant of that source.
; The prologue also aligns esp to 128 bytes, calls sub_4080F0(3) (purpose not
; visible here) and sets bit 8000h of MXCSR - confirm which compiler and
; options produced it.
; Register and stack-slot roles visible below:
;   edi    = atoi result (d), eax = calloc result (a)
;   ecx    = cached copy of a[d-1] at the loop head, saved in var_6C
;   var_80 = scratch for MXCSR first, then the starting tick count
;   var_7C = solution count
;   var_74 = &a[d], var_70 = d-2, var_60 = d-1
;   var_78 = saved a while eax is reused in the carry loop
;   var_5C = saved d, var_64 / var_68 = saved i+1 during the inner loop
_main proc near ; CODE XREF: ___tmainCRTStartup+106 p
var_80 = dword ptr -80h
var_7C = dword ptr -7Ch
var_78 = dword ptr -78h
var_74 = dword ptr -74h
var_70 = dword ptr -70h
var_6C = dword ptr -6Ch
var_68 = dword ptr -68h
var_64 = dword ptr -64h
var_60 = dword ptr -60h
var_5C = dword ptr -5Ch
argc = dword ptr 8
argv = dword ptr 0Ch
envp = dword ptr 10h
push ebp
mov ebp, esp
and esp, 0FFFFFF80h
push esi
push edi
push ebx
sub esp, 74h
push 3
call sub_4080F0
add esp, 4
; Read-modify-write of MXCSR: OR in 8000h (bit 15, the flush-to-zero control
; per the Intel SDM).
stmxcsr [esp+80h+var_80]
or [esp+80h+var_80], 8000h
ldmxcsr [esp+80h+var_80]
; argc != 2: return -1.
cmp [ebp+argc], 2
jz short loc_40103A
mov eax, 0FFFFFFFFh
add esp, 74h
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
retn
; ---------------------------------------------------------------------------
; esi = starting tick count. d = atoi(argv[1]); a = calloc(4, d+1). The single
; add esp, 0Ch removes all three pushed arguments (one for atoi, two for
; calloc).
loc_40103A: ; CODE XREF: _main+29 j
call ds:GetTickCount
mov esi, eax
mov eax, [ebp+argv]
push dword ptr [eax+4] ; char *
call _atoi
mov edi, eax
lea eax, [edi+1]
push eax ; size_t
push 4 ; size_t
call _calloc
add esp, 0Ch
mov ecx, [eax+edi*4-4]
lea edx, [edi-1]
mov [esp+80h+var_6C], ecx
xor ebx, ebx
mov [esp+80h+var_7C], ebx
lea ecx, [eax+edi*4]
mov [esp+80h+var_74], ecx
lea ecx, [edi-2]
mov [esp+80h+var_70], ecx
mov [esp+80h+var_60], edx
mov [esp+80h+var_80], esi
mov ecx, [esp+80h+var_6C]
; Main loop head: a[d-1] = ++ecx, then compare d against the stored value;
; if d is greater, no carry is needed and the pair check runs.
loc_401087: ; CODE XREF: _main+142 j
; _main+193 j
mov edx, [esp+80h+var_60]
inc ecx
mov [eax+edi*4-4], ecx
cmp edi, [eax+edx*4]
jg short loc_4010DC
mov esi, [esp+80h+var_70]
test esi, esi
js short loc_4010CE
xor edx, edx
mov [esp+80h+var_78], eax
xor ebx, ebx
mov eax, [esp+80h+var_74]
; Carry loop: eax = &a[d] and ebx counts down from 0, so [eax+ebx*4-8] starts
; at a[d-2]. An element whose incremented value stays below d ends the carry
; at loc_40117A; otherwise it is reset to 0 and the loop moves down.
loc_4010A9: ; CODE XREF: _main+C8 j
mov ecx, [eax+ebx*4-8]
inc ecx
cmp ecx, edi
jl loc_40117A
inc edx
lea esi, [ebx+edi-3]
mov dword ptr [eax+ebx*4-8], 0
dec ebx
cmp edx, [esp+80h+var_60]
jb short loc_4010A9
mov eax, [esp+80h+var_78]
; esi negative here means the carry ran past index 0: finish. Otherwise reset
; a[d-1] and its cached copy.
loc_4010CE: ; CODE XREF: _main+9B j
; _main+186 j
test esi, esi
jl short loc_401147
mov dword ptr [eax+edi*4-4], 0
xor ecx, ecx
; Inlined pair check. d <= 0 counts the candidate immediately. The cached
; a[d-1] and d are parked in var_6C / var_5C because ecx and edi are reused.
loc_4010DC: ; CODE XREF: _main+93 j
test edi, edi
jle short loc_40113E
mov [esp+80h+var_6C], ecx
xor edx, edx
mov [esp+80h+var_5C], edi
; Outer iteration: edx = i on entry; ecx = ebx = esi = i+1 (ebx is j).
; If i+1 >= d the inner loop is skipped. Otherwise edx = a[i] and edi = x = 1.
loc_4010EA: ; CODE XREF: _main+132 j
lea ecx, [edx+1]
mov ebx, ecx
mov esi, ebx
cmp ecx, [esp+80h+var_5C]
jge short loc_401130
mov edx, [eax+edx*4]
mov edi, 1
mov [esp+80h+var_64], esi
mov [esp+80h+var_68], ecx
; Inner iteration: esi = a[j]; reject when a[i] == a[j], when x equals
; a[i] - a[j] (ecx after neg) or when a[j] - a[i] equals x.
loc_401107: ; CODE XREF: _main+122 j
mov esi, [eax+ebx*4]
cmp edx, esi
jz short loc_40118B
sub esi, edx
mov ecx, esi
neg ecx
cmp edi, ecx
jz short loc_40118B
cmp esi, edi
jz short loc_40118B
inc ebx
inc edi
cmp ebx, [esp+80h+var_5C]
jl short loc_401107
mov ecx, [esp+80h+var_68]
mov esi, [esp+80h+var_64]
cmp ecx, [esp+80h+var_5C]
; edx = i+1 becomes the next i. mov leaves flags untouched, so jl still tests
; the preceding compare of i+1 against d.
loc_401130: ; CODE XREF: _main+F5 j
mov edx, esi
jl short loc_4010EA
xchg ax, ax
mov ecx, [esp+80h+var_6C]
mov edi, [esp+80h+var_5C]
; Candidate accepted: increment the in-memory count.
loc_40113E: ; CODE XREF: _main+DE j
inc [esp+80h+var_7C]
jmp loc_401087
; ---------------------------------------------------------------------------
; Exit: free(a), compute elapsed ticks, printf(format, count, elapsed).
; The final add esp, 80h removes the three printf arguments (0Ch) together
; with the 74h-byte local area; main returns 0.
loc_401147: ; CODE XREF: _main+D0 j
mov ebx, [esp+80h+var_7C]
mov esi, [esp+80h+var_80]
push eax ; void *
call _free
add esp, 4
call ds:GetTickCount
sub eax, esi
push eax
push ebx
push offset aDSolutionsInDM ; "%d solutions in %d
msecs.\n"
call _printf
xor eax, eax
add esp, 80h
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
retn
; ---------------------------------------------------------------------------
; Carry absorbed: store the incremented element, restore a into eax and
; rejoin the common path at loc_4010CE.
loc_40117A: ; CODE XREF: _main+B0 j
mov edx, [esp+80h+var_74]
mov eax, [esp+80h+var_78]
mov [edx+ebx*4-8], ecx
jmp loc_4010CE
; ---------------------------------------------------------------------------
; Candidate rejected: restore the cached a[d-1] and d, then try the next one
; without touching the count.
loc_40118B: ; CODE XREF: _main+10C j
; _main+116 j ...
mov ecx, [esp+80h+var_6C]
mov edi, [esp+80h+var_5C]
jmp loc_401087
_main endp
```
Jack Howarth
2015-Feb-13 19:47 UTC
[LLVMdev] trunk's optimizer generates slower code than 3.5
Also confirmed with the llvm 3.5.1 release and the llvm 3.6 release branch on x86_64-apple-darwin14...

% clang-3.5 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c
% time ./8 9
352 solutions
3.603u 0.002s 0:03.60 100.0% 0+0k 0+0io 2pf+0w
% time ./8 10
724 solutions
104.217u 0.059s 1:44.30 99.9% 0+0k 0+0io 2pf+0w
% clang-3.6 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector -fno-exceptions -o 8 8.c
% time ./8 9
352 solutions
4.050u 0.001s 0:04.05 100.0% 0+0k 0+0io 2pf+0w
% time ./8 10
724 solutions
114.808u 0.041s 1:54.86 99.9% 0+0k 0+0io 2pf+0w

On Fri, Feb 13, 2015 at 3:37 AM, 191919 <191919 at gmail.com> wrote:
> I submitted the problem report to clang's bugzilla but no one seems to
> care so I have to send it to the mailing list.
>
> clang 3.7 svn (trunk 229055 as the time I was to report this problem)
> generates slower code than 3.5 (Apple LLVM version 6.0
> (clang-600.0.56) (based on LLVM 3.5svn)) for the following code.
>
> It is a "8 queens puzzle" solver written as an educational example. As
> compiled by both clang 3.5 and 3.7, it gave the correct answer, but
> clang 3.5 generates code which runs 20% faster than 3.6/3.7.
> > ########################################## > # clang 3.5 which comes with Xcode 6.1.1 > ########################################## > $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 # 9 queens > 352 solutions > $ time ./8 10 # 10 queens > ./8 9 1.63s user 0.00s system 99% cpu 1.632 total > 724 solutions > ./8 10 45.11s user 0.01s system 99% cpu 45.121 total > > ########################################## > # clang 3.7 svn trunk > ########################################## > $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 # 9 queens > 352 solutions > ./8 9 2.07s user 0.00s system 99% cpu 2.078 total > $ time ./8 10 # 10 queens > 724 solutions > ./8 10 56.63s user 0.02s system 99% cpu 56.650 total > > The source code is below, I also attached the executable files as well > as the assembly code files for clang 3.5 and 3.6 by IDA. > > The performance is even worse when compiling as 32-bit code while > gcc-4.9.2 is not affected. 
> > ########## clang-3.5 > $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 1.95s user 0.00s system 99% cpu 1.950 total > > ########## clang-3.7 > $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 2.48s user 0.00s system 99% cpu 2.480 total > > ######### gcc-4.9.2 > $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector > -fno-exceptions -o 8 8.c > $ time ./8 9 > 352 solutions > ./8 9 1.44s user 0.00s system 99% cpu 1.442 total > > > ``` > #include <stdio.h> > #include <stdlib.h> > > static inline int validate(int* a, int d) > { > int i, j, x; > for (i = 0; i < d; ++i) > { > for (j = i+1, x = 1; j < d; ++j, ++x) > { > const int d = a[i] - a[j]; > if (d == 0 || d == -x || d == x) return 0; > } > } > return 1; > } > > static inline int solve(int d) > { > int r = 0; > int* a = (int*) calloc(sizeof(int), d+1); > int p = d - 1; > > for (;;) > { > a[p]++; > > if (a[p] > d-1) > { > int bp = p - 1; > while (bp >= 0) > { > a[bp]++; > if (a[bp] <= d-1) break; > a[bp] = 0; > --bp; > } > if (bp < 0) > break; > a[p] = 0; > } > if (validate(a, d)) > { > ++r; > } > } > > free(a); > return r; > } > > int main(int argc, char** argv) > { > if (argc != 2) return -1; > int r = solve((int) strtol(argv[1], NULL, 10)); > printf("%d solutions\n", r); > } > ``` > > clang 3.5's result: > > ``` > public _main > _main proc near > > var_48 = qword ptr -48h > var_40 = qword ptr -40h > var_34 = dword ptr -34h > > push rbp > push r15 > push r14 > push r13 > push r12 > push rbx > sub rsp, 18h > mov ebx, 0FFFFFFFFh > cmp edi, 2 > jnz loc_100000F29 > mov rdi, [rsi+8] ; char * > xor r14d, r14d > xor esi, esi ; char ** > mov edx, 0Ah ; int > call _strtol > mov r15, rax > shl rax, 20h > mov rsi, offset __mh_execute_header > add rsi, rax > sar rsi, 20h ; size_t > mov edi, 4 ; size_t > call _calloc > lea edx, [r15-1] > movsxd r8, edx > 
mov ecx, r15d > add ecx, 0FFFFFFFEh > js loc_100000DFA > test r15d, r15d > mov r11d, [rax+r8*4] > jle loc_100000EAE > mov ecx, r15d > add ecx, 0FFFFFFFEh > mov [rsp+48h+var_34], ecx > movsxd rcx, ecx > lea rcx, [rax+rcx*4] > mov [rsp+48h+var_40], rcx > lea rcx, [rax+4] > mov [rsp+48h+var_48], rcx > xor r14d, r14d > jmp short loc_100000D33 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000D30: ; CODE XREF: _main+129 j > ; _main+131 j ... > add r14d, ebx > > loc_100000D33: ; CODE XREF: _main+92 j > cmp r11d, edx > lea edi, [r11+1] > mov [rax+r8*4], edi > mov rcx, [rsp+48h+var_40] > mov esi, [rsp+48h+var_34] > mov r11d, edi > jl short loc_100000D84 > nop dword ptr [rax+00h] > > loc_100000D50: ; CODE XREF: _main+DA j > mov edi, [rcx] > lea ebp, [rdi+1] > mov [rcx], ebp > cmp edi, edx > jl short loc_100000D71 > mov dword ptr [rcx], 0 > add rcx, 0FFFFFFFFFFFFFFFCh > test esi, esi > lea esi, [rsi-1] > jg short loc_100000D50 > jmp loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000D71: ; CODE XREF: _main+C9 j > test esi, esi > js loc_100000F0E > mov dword ptr [rax+r8*4], 0 > xor r11d, r11d > > loc_100000D84: ; CODE XREF: _main+BA j > cmp r15d, 1 > mov esi, 0 > mov r9, [rsp+48h+var_48] > mov r12d, 1 > jle short loc_100000DF0 > > loc_100000D99: ; CODE XREF: _main+15E j > mov r10d, [rax+rsi*4] > mov ecx, 0FFFFFFFFh > mov edi, 1 > mov r13, r9 > nop word ptr [rax+rax+00h] > > loc_100000DB0: ; CODE XREF: _main+14F j > xor ebx, ebx > mov ebp, r10d > sub ebp, [r13+0] > jz loc_100000D30 > cmp ecx, ebp > jz loc_100000D30 > cmp edi, ebp > jz loc_100000D30 > add r13, 4 > inc rdi > dec ecx > mov ebx, edi > add ebx, esi > cmp ebx, r15d > jl short loc_100000DB0 > inc r12 > add r9, 4 > inc rsi > cmp r12d, r15d > jl short loc_100000D99 > > loc_100000DF0: ; CODE XREF: _main+107 j > mov ebx, 1 > jmp loc_100000D30 > ; 
--------------------------------------------------------------------------- > > loc_100000DFA: ; CODE XREF: _main+5E j > mov ecx, [rax+r8*4] > lea r9d, [rcx+1] > mov [rax+r8*4], r9d > cmp ecx, r8d > jge loc_100000F0E > lea r12, [rax+4] > xor r14d, r14d > db 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000E20: ; CODE XREF: _main+216 j > test r15d, r15d > setle cl > cmp r15d, 2 > jl short loc_100000E90 > test cl, cl > mov r13d, 0 > mov r11, r12 > mov r10d, 1 > jnz short loc_100000E90 > > loc_100000E3F: ; CODE XREF: _main+1F0 j > mov edi, [rax+r13*4] > mov edx, 0FFFFFFFFh > mov ecx, 1 > mov rsi, r11 > > loc_100000E50: ; CODE XREF: _main+1E1 j > xor ebx, ebx > mov ebp, edi > sub ebp, [rsi] > jz short loc_100000E95 > cmp edx, ebp > jz short loc_100000E95 > cmp ecx, ebp > jz short loc_100000E95 > add rsi, 4 > inc rcx > dec edx > mov ebx, ecx > add ebx, r13d > cmp ebx, r15d > jl short loc_100000E50 > inc r10 > add r11, 4 > inc r13 > cmp r10d, r15d > jl short loc_100000E3F > db 66h, 66h, 66h, 66h, 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000E90: ; CODE XREF: _main+19A j > ; _main+1AD j > mov ebx, 1 > > loc_100000E95: ; CODE XREF: _main+1C6 j > ; _main+1CA j ... 
> add r14d, ebx > cmp r9d, r8d > lea ecx, [r9+1] > mov [rax+r8*4], ecx > mov r9d, ecx > jl loc_100000E20 > jmp short loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000EAE: ; CODE XREF: _main+6B j > add r15d, 0FFFFFFFEh > movsxd rcx, r15d > lea rcx, [rax+rcx*4] > xor r14d, r14d > jmp short loc_100000EC6 > ; --------------------------------------------------------------------------- > align 20h > > loc_100000EC0: ; CODE XREF: _main+247 j > ; _main+27C j > inc r14d > mov r11d, ebp > > loc_100000EC6: ; CODE XREF: _main+22C j > lea ebp, [r11+1] > mov [rax+r8*4], ebp > cmp r11d, r8d > mov rsi, rcx > mov edi, r15d > jl short loc_100000EC0 > nop dword ptr [rax+00000000h] > > loc_100000EE0: ; CODE XREF: _main+26A j > mov ebp, [rsi] > lea ebx, [rbp+1] > mov [rsi], ebx > cmp ebp, edx > jl short loc_100000EFE > mov dword ptr [rsi], 0 > add rsi, 0FFFFFFFFFFFFFFFCh > test edi, edi > lea edi, [rdi-1] > jg short loc_100000EE0 > jmp short loc_100000F0E > ; --------------------------------------------------------------------------- > > loc_100000EFE: ; CODE XREF: _main+259 j > test edi, edi > js short loc_100000F0E > mov dword ptr [rax+r8*4], 0 > xor ebp, ebp > jmp short loc_100000EC0 > ; --------------------------------------------------------------------------- > > loc_100000F0E: ; CODE XREF: _main+DC j > ; _main+E3 j ... 
> mov rdi, rax ; void * > call _free > lea rdi, aDSolutions ; "%d solutions\n" > xor ebx, ebx > xor eax, eax > mov esi, r14d > call _printf > > loc_100000F29: ; CODE XREF: _main+16 j > mov eax, ebx > add rsp, 18h > pop rbx > pop r12 > pop r13 > pop r14 > pop r15 > pop rbp > retn > _main endp > ``` > > clang 3.6's result: > > ``` > public _main > _main proc near > > var_60 = qword ptr -60h > var_58 = qword ptr -58h > var_50 = qword ptr -50h > var_48 = qword ptr -48h > var_40 = qword ptr -40h > var_38 = qword ptr -38h > > push rbp > push r15 > push r14 > push r13 > push r12 > push rbx > sub rsp, 38h > mov ebx, 0FFFFFFFFh > cmp edi, 2 > jnz loc_100000F23 > mov rbx, offset __mh_execute_header > mov rdi, [rsi+8] ; char * > xor r13d, r13d > xor esi, esi ; char ** > mov edx, 0Ah ; int > call _strtol > mov r14, rax > shl rax, 20h > mov [rsp+68h+var_38], rax > lea rsi, [rax+rbx] > sar rsi, 20h ; size_t > mov edi, 4 ; size_t > call _calloc > lea r11d, [r14-1] > movsxd r12, r11d > mov [rsp+68h+var_40], r12 > movsxd rcx, r14d > mov [rsp+68h+var_50], rcx > add ecx, 0FFFFFFFEh > js loc_100000E1A > mov ecx, r14d > add ecx, 0FFFFFFFEh > movsxd rcx, ecx > inc rcx > mov [rsp+68h+var_58], rcx > mov rcx, rax > add rcx, 4 > mov [rsp+68h+var_60], rcx > xor ebp, ebp > jmp short loc_100000D17 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000D10: ; CODE XREF: _main+15B j > ; _main+163 j ... 
> mov rbp, [rsp+68h+var_48] > add ebp, edi > > loc_100000D17: ; CODE XREF: _main+93 j > cmp r13d, r11d > lea edx, [r13+1] > mov [rax+r12*4], edx > mov rcx, [rsp+68h+var_58] > mov r13d, edx > jl short loc_100000D6B > nop dword ptr [rax+00h] > > loc_100000D30: ; CODE XREF: _main+DE j > mov edx, [rax+rcx*4-4] > lea esi, [rdx+1] > mov [rax+rcx*4-4], esi > cmp edx, r11d > jl short loc_100000D60 > mov dword ptr [rax+rcx*4-4], 0 > dec rcx > test rcx, rcx > jg short loc_100000D30 > jmp loc_100000F09 > ; --------------------------------------------------------------------------- > align 20h > > loc_100000D60: ; CODE XREF: _main+CE j > mov dword ptr [rax+r12*4], 0 > xor r13d, r13d > > loc_100000D6B: ; CODE XREF: _main+BA j > mov [rsp+68h+var_48], rbp > test r14d, r14d > setle cl > mov rdx, offset __mh_execute_header > lea rdx, [rdx+1] > cmp [rsp+68h+var_38], rdx > jl loc_100000E10 > test cl, cl > mov edx, 0 > mov r10, [rsp+68h+var_60] > mov r9d, 1 > jnz short loc_100000E10 > > loc_100000DA3: ; CODE XREF: _main+195 j > mov esi, [rax+rdx*4] > mov r15d, 0FFFFFFFFh > mov r8d, 1 > mov rcx, r10 > db 66h, 66h, 2Eh > nop dword ptr [rax+rax+00000000h] > > loc_100000DC0: ; CODE XREF: _main+184 j > mov ebx, [rcx] > mov ebp, esi > sub ebp, ebx > xor edi, edi > cmp r8d, ebp > jz loc_100000D10 > cmp esi, ebx > jz loc_100000D10 > cmp r15d, ebp > jz loc_100000D10 > add rcx, 4 > inc r8 > dec r15d > mov edi, r8d > add edi, edx > cmp edi, r14d > jl short loc_100000DC0 > inc r9 > add r10, 4 > inc rdx > cmp r9, [rsp+68h+var_50] > jl short loc_100000DA3 > nop word ptr [rax+rax+00000000h] > > loc_100000E10: ; CODE XREF: _main+119 j > ; _main+131 j > mov edi, 1 > jmp loc_100000D10 > ; --------------------------------------------------------------------------- > > loc_100000E1A: ; CODE XREF: _main+6E j > test r14d, r14d > jle loc_100000F00 > mov dword ptr [rax+r12*4], 1 > xor ebp, ebp > cmp r14d, 2 > jl loc_100000F09 > mov rcx, rax > add rcx, 4 > mov [rsp+68h+var_48], rcx > xor ebp, ebp > mov r15d, 
1 > nop dword ptr [rax+rax+00h] > > loc_100000E50: ; CODE XREF: _main+288 j > mov rbx, rbp > mov rcx, offset __mh_execute_header > cmp [rsp+68h+var_38], rcx > mov edx, 0 > mov r13, [rsp+68h+var_48] > mov r8d, 1 > mov r9d, 1 > jle short loc_100000EE0 > > loc_100000E7A: ; CODE XREF: _main+25A j > mov r12d, [rax+rdx*4] > mov edi, 0FFFFFFFFh > mov ecx, 1 > mov rsi, r13 > nop dword ptr [rax+rax+00h] > > loc_100000E90: ; CODE XREF: _main+249 j > mov r10d, [rsi] > mov ebp, r12d > sub ebp, r10d > xor r9d, r9d > cmp ecx, ebp > jz short loc_100000EE0 > cmp r12d, r10d > jz short loc_100000EE0 > cmp edi, ebp > jz short loc_100000EE0 > add rsi, 4 > inc rcx > dec edi > mov ebp, ecx > add ebp, edx > cmp ebp, r14d > jl short loc_100000E90 > inc r8 > add r13, 4 > inc rdx > cmp r8, [rsp+68h+var_50] > jl short loc_100000E7A > mov r9d, 1 > db 66h, 66h, 66h, 66h, 2Eh > nop word ptr [rax+rax+00000000h] > > loc_100000EE0: ; CODE XREF: _main+208 j > ; _main+22E j ... > mov rbp, rbx > add ebp, r9d > cmp r15d, r11d > lea ecx, [r15+1] > mov rdx, [rsp+68h+var_40] > mov [rax+rdx*4], ecx > mov r15d, ecx > jl loc_100000E50 > jmp short loc_100000F09 > ; --------------------------------------------------------------------------- > > loc_100000F00: ; CODE XREF: _main+1AD j > xor ebp, ebp > test r11d, r11d > cmovns ebp, r11d > > loc_100000F09: ; CODE XREF: _main+E0 j > ; _main+1C1 j ... 
> mov rdi, rax ; void * > call _free > lea rdi, aDSolutions ; "%d solutions\n" > xor ebx, ebx > xor eax, eax > mov esi, ebp > call _printf > > loc_100000F23: ; CODE XREF: _main+16 j > mov eax, ebx > add rsp, 38h > pop rbx > pop r12 > pop r13 > pop r14 > pop r15 > pop rbp > retn > _main endp > ``` > > gcc-4.9.2's result: > ``` > > _main proc near > > var_48 = qword ptr -48h > var_40 = dword ptr -40h > var_3C = dword ptr -3Ch > > cmp edi, 2 > jz short loc_100000D69 > or eax, 0FFFFFFFFh > retn > ; --------------------------------------------------------------------------- > > loc_100000D69: ; CODE XREF: _main+3 j > push r15 > mov edx, 0Ah ; int > push r14 > push r13 > push r12 > push rbp > push rbx > sub rsp, 18h > mov rdi, [rsi+8] ; char * > xor esi, esi ; char ** > call _strtol > mov edi, 4 ; size_t > lea esi, [rax+1] > mov r14, rax > mov ebx, eax > lea r15d, [r14-2] > movsxd rsi, esi ; size_t > call _calloc > mov [rsp+48h+var_3C], 0 > mov rdi, rax ; void * > lea eax, [r14-1] > cdqe > lea r13, [rdi+rax*4] > movsxd rax, r15d > mov ebp, [r13+0] > shl rax, 2 > lea r12, [rdi+rax] > lea rax, [rdi+rax-4] > mov [rsp+48h+var_48], rax > mov eax, r14d > lea r14d, [r14+1] > nop word ptr [rax+rax+00h] > nop word ptr [rax+rax+00h] > > loc_100000DE0: ; CODE XREF: _main+12B j > ; _main+155 j ... 
> add ebp, 1 > cmp ebx, ebp > mov [r13+0], ebp > jg short loc_100000E62 > test r15d, r15d > js short loc_100000E33 > mov ecx, [r12] > lea edx, [rcx+1] > cmp ebx, edx > mov [r12], edx > jg short loc_100000E58 > mov r8, r12 > mov rcx, [rsp+48h+var_48] > mov esi, r15d > jmp short loc_100000E24 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000E10: ; CODE XREF: _main+D1 j > mov edx, [rcx] > sub r8, 4 > sub rcx, 4 > add edx, 1 > mov [rcx+4], edx > cmp ebx, edx > jg short loc_100000E58 > > loc_100000E24: ; CODE XREF: _main+A9 j > sub esi, 1 > mov dword ptr [r8], 0 > cmp esi, 0FFFFFFFFh > jnz short loc_100000E10 > > loc_100000E33: ; CODE XREF: _main+8E j > call _free > mov esi, [rsp+48h+var_3C] > add rsp, 18h > xor eax, eax > pop rbx > lea rdi, aDSolutions ; "%d solutions\n" > pop rbp > pop r12 > pop r13 > pop r14 > pop r15 > jmp _printf > ; --------------------------------------------------------------------------- > > loc_100000E58: ; CODE XREF: _main+9D j > ; _main+C2 j > mov dword ptr [r13+0], 0 > xor ebp, ebp > > loc_100000E62: ; CODE XREF: _main+89 j > test ebx, ebx > jle loc_100000EE6 > lea r11, [rdi+8] > xor r10d, r10d > > loc_100000E71: ; CODE XREF: _main+184 j > add r10d, 1 > cmp r10d, eax > jz short loc_100000EE6 > mov r8d, [r11-8] > mov edx, r8d > sub edx, [r11-4] > add edx, 1 > cmp edx, 2 > jbe loc_100000DE0 > mov r9d, r14d > mov rcx, r11 > mov edx, 1 > mov [rsp+48h+var_40], r10d > sub r9d, r10d > jmp short loc_100000ED3 > ; --------------------------------------------------------------------------- > align 10h > > loc_100000EB0: ; CODE XREF: _main+179 j > mov esi, r8d > sub esi, [rcx] > jz loc_100000DE0 > mov r10d, esi > add rcx, 4 > add r10d, edx > jz loc_100000DE0 > cmp esi, edx > jz loc_100000DE0 > > loc_100000ED3: ; CODE XREF: _main+144 j > add edx, 1 > cmp edx, r9d > jnz short loc_100000EB0 > mov r10d, [rsp+48h+var_40] > add r11, 4 > jmp short loc_100000E71 > ; 
--------------------------------------------------------------------------- > > loc_100000EE6: ; CODE XREF: _main+104 j > ; _main+118 j > add [rsp+48h+var_3C], 1 > jmp loc_100000DE0 > _main endp > ``` > > MSVC 10.0's result: > > ``` > > _main proc near ; CODE XREF: ___tmainCRTStartup+106 p > > var_80 = dword ptr -80h > var_7C = dword ptr -7Ch > var_78 = dword ptr -78h > var_74 = dword ptr -74h > var_70 = dword ptr -70h > var_6C = dword ptr -6Ch > var_68 = dword ptr -68h > var_64 = dword ptr -64h > var_60 = dword ptr -60h > var_5C = dword ptr -5Ch > argc = dword ptr 8 > argv = dword ptr 0Ch > envp = dword ptr 10h > > push ebp > mov ebp, esp > and esp, 0FFFFFF80h > push esi > push edi > push ebx > sub esp, 74h > push 3 > call sub_4080F0 > add esp, 4 > stmxcsr [esp+80h+var_80] > or [esp+80h+var_80], 8000h > ldmxcsr [esp+80h+var_80] > cmp [ebp+argc], 2 > jz short loc_40103A > mov eax, 0FFFFFFFFh > add esp, 74h > pop ebx > pop edi > pop esi > mov esp, ebp > pop ebp > retn > ; --------------------------------------------------------------------------- > > loc_40103A: ; CODE XREF: _main+29 j > call ds:GetTickCount > mov esi, eax > mov eax, [ebp+argv] > push dword ptr [eax+4] ; char * > call _atoi > mov edi, eax > lea eax, [edi+1] > push eax ; size_t > push 4 ; size_t > call _calloc > add esp, 0Ch > mov ecx, [eax+edi*4-4] > lea edx, [edi-1] > mov [esp+80h+var_6C], ecx > xor ebx, ebx > mov [esp+80h+var_7C], ebx > lea ecx, [eax+edi*4] > mov [esp+80h+var_74], ecx > lea ecx, [edi-2] > mov [esp+80h+var_70], ecx > mov [esp+80h+var_60], edx > mov [esp+80h+var_80], esi > mov ecx, [esp+80h+var_6C] > > loc_401087: ; CODE XREF: _main+142 j > ; _main+193 j > mov edx, [esp+80h+var_60] > inc ecx > mov [eax+edi*4-4], ecx > cmp edi, [eax+edx*4] > jg short loc_4010DC > mov esi, [esp+80h+var_70] > test esi, esi > js short loc_4010CE > xor edx, edx > mov [esp+80h+var_78], eax > xor ebx, ebx > mov eax, [esp+80h+var_74] > > loc_4010A9: ; CODE XREF: _main+C8 j > mov ecx, [eax+ebx*4-8] > inc ecx 
> cmp ecx, edi > jl loc_40117A > inc edx > lea esi, [ebx+edi-3] > mov dword ptr [eax+ebx*4-8], 0 > dec ebx > cmp edx, [esp+80h+var_60] > jb short loc_4010A9 > mov eax, [esp+80h+var_78] > > loc_4010CE: ; CODE XREF: _main+9B j > ; _main+186 j > test esi, esi > jl short loc_401147 > mov dword ptr [eax+edi*4-4], 0 > xor ecx, ecx > > loc_4010DC: ; CODE XREF: _main+93 j > test edi, edi > jle short loc_40113E > mov [esp+80h+var_6C], ecx > xor edx, edx > mov [esp+80h+var_5C], edi > > loc_4010EA: ; CODE XREF: _main+132 j > lea ecx, [edx+1] > mov ebx, ecx > mov esi, ebx > cmp ecx, [esp+80h+var_5C] > jge short loc_401130 > mov edx, [eax+edx*4] > mov edi, 1 > mov [esp+80h+var_64], esi > mov [esp+80h+var_68], ecx > > loc_401107: ; CODE XREF: _main+122 j > mov esi, [eax+ebx*4] > cmp edx, esi > jz short loc_40118B > sub esi, edx > mov ecx, esi > neg ecx > cmp edi, ecx > jz short loc_40118B > cmp esi, edi > jz short loc_40118B > inc ebx > inc edi > cmp ebx, [esp+80h+var_5C] > jl short loc_401107 > mov ecx, [esp+80h+var_68] > mov esi, [esp+80h+var_64] > cmp ecx, [esp+80h+var_5C] > > loc_401130: ; CODE XREF: _main+F5 j > mov edx, esi > jl short loc_4010EA > xchg ax, ax > mov ecx, [esp+80h+var_6C] > mov edi, [esp+80h+var_5C] > > loc_40113E: ; CODE XREF: _main+DE j > inc [esp+80h+var_7C] > jmp loc_401087 > ; --------------------------------------------------------------------------- > > loc_401147: ; CODE XREF: _main+D0 j > mov ebx, [esp+80h+var_7C] > mov esi, [esp+80h+var_80] > push eax ; void * > call _free > add esp, 4 > call ds:GetTickCount > sub eax, esi > push eax > push ebx > push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n" > call _printf > xor eax, eax > add esp, 80h > pop ebx > pop edi > pop esi > mov esp, ebp > pop ebp > retn > ; --------------------------------------------------------------------------- > > loc_40117A: ; CODE XREF: _main+B0 j > mov edx, [esp+80h+var_74] > mov eax, [esp+80h+var_78] > mov [edx+ebx*4-8], ecx > jmp loc_4010CE > ; 
--------------------------------------------------------------------------- > > loc_40118B: ; CODE XREF: _main+10C j > ; _main+116 j ... > mov ecx, [esp+80h+var_6C] > mov edi, [esp+80h+var_5C] > jmp loc_401087 > _main endp > ``` > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Jack Howarth
2015-Feb-14 16:13 UTC
[LLVMdev] trunk's optimizer generates slower code than 3.5
The regressions in the performance of generated code, introduced
by the llvm 3.6 release, don't seem to be limited to this "8 queens
puzzle" solver test case. See...
http://www.phoronix.com/scan.php?page=article&item=llvm-clang-3.5-3.6-rc1&num=1
where a big hit in the performance of the Sparse Matrix Multiply test
of the SciMark v2.0 benchmark was observed, as well as others.
Do you really want to release 3.6 with this level of performance regression?
Jack
On Fri, Feb 13, 2015 at 2:47 PM, Jack Howarth
<howarth.mailing.lists at gmail.com> wrote:
> Also confirmed with the llvm 3.5.1 release and the llvm 3.6 release
> branch on x86_64-apple-darwin14...
>
> % clang-3.5 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
> -fno-exceptions -o 8 8.c
> % time ./8 9
> 352 solutions
> 3.603u 0.002s 0:03.60 100.0% 0+0k 0+0io 2pf+0w
> % time ./8 10
> 724 solutions
> 104.217u 0.059s 1:44.30 99.9% 0+0k 0+0io 2pf+0w
>
> % clang-3.6 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
> -fno-exceptions -o 8 8.c
> % time ./8 9
> 352 solutions
> 4.050u 0.001s 0:04.05 100.0% 0+0k 0+0io 2pf+0w
> % time ./8 10
> 724 solutions
> 114.808u 0.041s 1:54.86 99.9% 0+0k 0+0io 2pf+0w
>
> On Fri, Feb 13, 2015 at 3:37 AM, 191919 <191919 at gmail.com> wrote:
>> I submitted the problem report to clang's bugzilla but no one seems to
>> care so I have to send it to the mailing list.
>>
>> clang 3.7 svn (trunk 229055 at the time I reported this problem)
>> generates slower code than 3.5 (Apple LLVM version 6.0
>> (clang-600.0.56) (based on LLVM 3.5svn)) for the following code.
>>
>> It is an "8 queens puzzle" solver written as an educational example. As
>> compiled by both clang 3.5 and 3.7, it gave the correct answer, but
>> clang 3.5 generates code which runs 20% faster than 3.6/3.7.
>>
>> ##########################################
>> # clang 3.5 which comes with Xcode 6.1.1
>> ##########################################
>> $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> $ time ./8 9 # 9 queens
>> 352 solutions
>> ./8 9 1.63s user 0.00s system 99% cpu 1.632 total
>> $ time ./8 10 # 10 queens
>> 724 solutions
>> ./8 10 45.11s user 0.01s system 99% cpu 45.121 total
>>
>> ##########################################
>> # clang 3.7 svn trunk
>> ##########################################
>> $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> $ time ./8 9 # 9 queens
>> 352 solutions
>> ./8 9 2.07s user 0.00s system 99% cpu 2.078 total
>> $ time ./8 10 # 10 queens
>> 724 solutions
>> ./8 10 56.63s user 0.02s system 99% cpu 56.650 total
>>
>> The source code is below, I also attached the executable files as well
>> as the assembly code files for clang 3.5 and 3.6 by IDA.
>>
>> The performance is even worse when compiling as 32-bit code while
>> gcc-4.9.2 is not affected.
>>
>> ########## clang-3.5
>> $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> $ time ./8 9
>> 352 solutions
>> ./8 9 1.95s user 0.00s system 99% cpu 1.950 total
>>
>> ########## clang-3.7
>> $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> $ time ./8 9
>> 352 solutions
>> ./8 9 2.48s user 0.00s system 99% cpu 2.480 total
>>
>> ######### gcc-4.9.2
>> $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> $ time ./8 9
>> 352 solutions
>> ./8 9 1.44s user 0.00s system 99% cpu 1.442 total
>>
>>
>> ```
>> #include <stdio.h>
>> #include <stdlib.h>
>>
>> static inline int validate(int* a, int d)
>> {
>> int i, j, x;
>> for (i = 0; i < d; ++i)
>> {
>> for (j = i+1, x = 1; j < d; ++j, ++x)
>> {
>> const int d = a[i] - a[j];
>> if (d == 0 || d == -x || d == x) return 0;
>> }
>> }
>> return 1;
>> }
>>
>> static inline int solve(int d)
>> {
>> int r = 0;
>> int* a = (int*) calloc(sizeof(int), d+1);
>> int p = d - 1;
>>
>> for (;;)
>> {
>> a[p]++;
>>
>> if (a[p] > d-1)
>> {
>> int bp = p - 1;
>> while (bp >= 0)
>> {
>> a[bp]++;
>> if (a[bp] <= d-1) break;
>> a[bp] = 0;
>> --bp;
>> }
>> if (bp < 0)
>> break;
>> a[p] = 0;
>> }
>> if (validate(a, d))
>> {
>> ++r;
>> }
>> }
>>
>> free(a);
>> return r;
>> }
>>
>> int main(int argc, char** argv)
>> {
>> if (argc != 2) return -1;
>> int r = solve((int) strtol(argv[1], NULL, 10));
>> printf("%d solutions\n", r);
>> }
>> ```
>>
>> clang 3.5's result:
>>
>> ```
>> public _main
>> _main proc near
>>
>> var_48 = qword ptr -48h
>> var_40 = qword ptr -40h
>> var_34 = dword ptr -34h
>>
>> push rbp
>> push r15
>> push r14
>> push r13
>> push r12
>> push rbx
>> sub rsp, 18h
>> mov ebx, 0FFFFFFFFh
>> cmp edi, 2
>> jnz loc_100000F29
>> mov rdi, [rsi+8] ; char *
>> xor r14d, r14d
>> xor esi, esi ; char **
>> mov edx, 0Ah ; int
>> call _strtol
>> mov r15, rax
>> shl rax, 20h
>> mov rsi, offset __mh_execute_header
>> add rsi, rax
>> sar rsi, 20h ; size_t
>> mov edi, 4 ; size_t
>> call _calloc
>> lea edx, [r15-1]
>> movsxd r8, edx
>> mov ecx, r15d
>> add ecx, 0FFFFFFFEh
>> js loc_100000DFA
>> test r15d, r15d
>> mov r11d, [rax+r8*4]
>> jle loc_100000EAE
>> mov ecx, r15d
>> add ecx, 0FFFFFFFEh
>> mov [rsp+48h+var_34], ecx
>> movsxd rcx, ecx
>> lea rcx, [rax+rcx*4]
>> mov [rsp+48h+var_40], rcx
>> lea rcx, [rax+4]
>> mov [rsp+48h+var_48], rcx
>> xor r14d, r14d
>> jmp short loc_100000D33
>> ;
---------------------------------------------------------------------------
>> align 10h
>>
>> loc_100000D30: ; CODE XREF: _main+129 j
>> ; _main+131 j ...
>> add r14d, ebx
>>
>> loc_100000D33: ; CODE XREF: _main+92 j
>> cmp r11d, edx
>> lea edi, [r11+1]
>> mov [rax+r8*4], edi
>> mov rcx, [rsp+48h+var_40]
>> mov esi, [rsp+48h+var_34]
>> mov r11d, edi
>> jl short loc_100000D84
>> nop dword ptr [rax+00h]
>>
>> loc_100000D50: ; CODE XREF: _main+DA j
>> mov edi, [rcx]
>> lea ebp, [rdi+1]
>> mov [rcx], ebp
>> cmp edi, edx
>> jl short loc_100000D71
>> mov dword ptr [rcx], 0
>> add rcx, 0FFFFFFFFFFFFFFFCh
>> test esi, esi
>> lea esi, [rsi-1]
>> jg short loc_100000D50
>> jmp loc_100000F0E
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000D71: ; CODE XREF: _main+C9 j
>> test esi, esi
>> js loc_100000F0E
>> mov dword ptr [rax+r8*4], 0
>> xor r11d, r11d
>>
>> loc_100000D84: ; CODE XREF: _main+BA j
>> cmp r15d, 1
>> mov esi, 0
>> mov r9, [rsp+48h+var_48]
>> mov r12d, 1
>> jle short loc_100000DF0
>>
>> loc_100000D99: ; CODE XREF: _main+15E j
>> mov r10d, [rax+rsi*4]
>> mov ecx, 0FFFFFFFFh
>> mov edi, 1
>> mov r13, r9
>> nop word ptr [rax+rax+00h]
>>
>> loc_100000DB0: ; CODE XREF: _main+14F j
>> xor ebx, ebx
>> mov ebp, r10d
>> sub ebp, [r13+0]
>> jz loc_100000D30
>> cmp ecx, ebp
>> jz loc_100000D30
>> cmp edi, ebp
>> jz loc_100000D30
>> add r13, 4
>> inc rdi
>> dec ecx
>> mov ebx, edi
>> add ebx, esi
>> cmp ebx, r15d
>> jl short loc_100000DB0
>> inc r12
>> add r9, 4
>> inc rsi
>> cmp r12d, r15d
>> jl short loc_100000D99
>>
>> loc_100000DF0: ; CODE XREF: _main+107 j
>> mov ebx, 1
>> jmp loc_100000D30
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000DFA: ; CODE XREF: _main+5E j
>> mov ecx, [rax+r8*4]
>> lea r9d, [rcx+1]
>> mov [rax+r8*4], r9d
>> cmp ecx, r8d
>> jge loc_100000F0E
>> lea r12, [rax+4]
>> xor r14d, r14d
>> db 2Eh
>> nop word ptr [rax+rax+00000000h]
>>
>> loc_100000E20: ; CODE XREF: _main+216 j
>> test r15d, r15d
>> setle cl
>> cmp r15d, 2
>> jl short loc_100000E90
>> test cl, cl
>> mov r13d, 0
>> mov r11, r12
>> mov r10d, 1
>> jnz short loc_100000E90
>>
>> loc_100000E3F: ; CODE XREF: _main+1F0 j
>> mov edi, [rax+r13*4]
>> mov edx, 0FFFFFFFFh
>> mov ecx, 1
>> mov rsi, r11
>>
>> loc_100000E50: ; CODE XREF: _main+1E1 j
>> xor ebx, ebx
>> mov ebp, edi
>> sub ebp, [rsi]
>> jz short loc_100000E95
>> cmp edx, ebp
>> jz short loc_100000E95
>> cmp ecx, ebp
>> jz short loc_100000E95
>> add rsi, 4
>> inc rcx
>> dec edx
>> mov ebx, ecx
>> add ebx, r13d
>> cmp ebx, r15d
>> jl short loc_100000E50
>> inc r10
>> add r11, 4
>> inc r13
>> cmp r10d, r15d
>> jl short loc_100000E3F
>> db 66h, 66h, 66h, 66h, 2Eh
>> nop word ptr [rax+rax+00000000h]
>>
>> loc_100000E90: ; CODE XREF: _main+19A j
>> ; _main+1AD j
>> mov ebx, 1
>>
>> loc_100000E95: ; CODE XREF: _main+1C6 j
>> ; _main+1CA j ...
>> add r14d, ebx
>> cmp r9d, r8d
>> lea ecx, [r9+1]
>> mov [rax+r8*4], ecx
>> mov r9d, ecx
>> jl loc_100000E20
>> jmp short loc_100000F0E
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000EAE: ; CODE XREF: _main+6B j
>> add r15d, 0FFFFFFFEh
>> movsxd rcx, r15d
>> lea rcx, [rax+rcx*4]
>> xor r14d, r14d
>> jmp short loc_100000EC6
>> ;
---------------------------------------------------------------------------
>> align 20h
>>
>> loc_100000EC0: ; CODE XREF: _main+247 j
>> ; _main+27C j
>> inc r14d
>> mov r11d, ebp
>>
>> loc_100000EC6: ; CODE XREF: _main+22C j
>> lea ebp, [r11+1]
>> mov [rax+r8*4], ebp
>> cmp r11d, r8d
>> mov rsi, rcx
>> mov edi, r15d
>> jl short loc_100000EC0
>> nop dword ptr [rax+00000000h]
>>
>> loc_100000EE0: ; CODE XREF: _main+26A j
>> mov ebp, [rsi]
>> lea ebx, [rbp+1]
>> mov [rsi], ebx
>> cmp ebp, edx
>> jl short loc_100000EFE
>> mov dword ptr [rsi], 0
>> add rsi, 0FFFFFFFFFFFFFFFCh
>> test edi, edi
>> lea edi, [rdi-1]
>> jg short loc_100000EE0
>> jmp short loc_100000F0E
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000EFE: ; CODE XREF: _main+259 j
>> test edi, edi
>> js short loc_100000F0E
>> mov dword ptr [rax+r8*4], 0
>> xor ebp, ebp
>> jmp short loc_100000EC0
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000F0E: ; CODE XREF: _main+DC j
>> ; _main+E3 j ...
>> mov rdi, rax ; void *
>> call _free
>> lea rdi, aDSolutions ; "%d solutions\n"
>> xor ebx, ebx
>> xor eax, eax
>> mov esi, r14d
>> call _printf
>>
>> loc_100000F29: ; CODE XREF: _main+16 j
>> mov eax, ebx
>> add rsp, 18h
>> pop rbx
>> pop r12
>> pop r13
>> pop r14
>> pop r15
>> pop rbp
>> retn
>> _main endp
>> ```
>>
>> clang 3.6's result:
>>
>> ```
>> public _main
>> _main proc near
>>
>> var_60 = qword ptr -60h
>> var_58 = qword ptr -58h
>> var_50 = qword ptr -50h
>> var_48 = qword ptr -48h
>> var_40 = qword ptr -40h
>> var_38 = qword ptr -38h
>>
>> push rbp
>> push r15
>> push r14
>> push r13
>> push r12
>> push rbx
>> sub rsp, 38h
>> mov ebx, 0FFFFFFFFh
>> cmp edi, 2
>> jnz loc_100000F23
>> mov rbx, offset __mh_execute_header
>> mov rdi, [rsi+8] ; char *
>> xor r13d, r13d
>> xor esi, esi ; char **
>> mov edx, 0Ah ; int
>> call _strtol
>> mov r14, rax
>> shl rax, 20h
>> mov [rsp+68h+var_38], rax
>> lea rsi, [rax+rbx]
>> sar rsi, 20h ; size_t
>> mov edi, 4 ; size_t
>> call _calloc
>> lea r11d, [r14-1]
>> movsxd r12, r11d
>> mov [rsp+68h+var_40], r12
>> movsxd rcx, r14d
>> mov [rsp+68h+var_50], rcx
>> add ecx, 0FFFFFFFEh
>> js loc_100000E1A
>> mov ecx, r14d
>> add ecx, 0FFFFFFFEh
>> movsxd rcx, ecx
>> inc rcx
>> mov [rsp+68h+var_58], rcx
>> mov rcx, rax
>> add rcx, 4
>> mov [rsp+68h+var_60], rcx
>> xor ebp, ebp
>> jmp short loc_100000D17
>> ;
---------------------------------------------------------------------------
>> align 10h
>>
>> loc_100000D10: ; CODE XREF: _main+15B j
>> ; _main+163 j ...
>> mov rbp, [rsp+68h+var_48]
>> add ebp, edi
>>
>> loc_100000D17: ; CODE XREF: _main+93 j
>> cmp r13d, r11d
>> lea edx, [r13+1]
>> mov [rax+r12*4], edx
>> mov rcx, [rsp+68h+var_58]
>> mov r13d, edx
>> jl short loc_100000D6B
>> nop dword ptr [rax+00h]
>>
>> loc_100000D30: ; CODE XREF: _main+DE j
>> mov edx, [rax+rcx*4-4]
>> lea esi, [rdx+1]
>> mov [rax+rcx*4-4], esi
>> cmp edx, r11d
>> jl short loc_100000D60
>> mov dword ptr [rax+rcx*4-4], 0
>> dec rcx
>> test rcx, rcx
>> jg short loc_100000D30
>> jmp loc_100000F09
>> ;
---------------------------------------------------------------------------
>> align 20h
>>
>> loc_100000D60: ; CODE XREF: _main+CE j
>> mov dword ptr [rax+r12*4], 0
>> xor r13d, r13d
>>
>> loc_100000D6B: ; CODE XREF: _main+BA j
>> mov [rsp+68h+var_48], rbp
>> test r14d, r14d
>> setle cl
>> mov rdx, offset __mh_execute_header
>> lea rdx, [rdx+1]
>> cmp [rsp+68h+var_38], rdx
>> jl loc_100000E10
>> test cl, cl
>> mov edx, 0
>> mov r10, [rsp+68h+var_60]
>> mov r9d, 1
>> jnz short loc_100000E10
>>
>> loc_100000DA3: ; CODE XREF: _main+195 j
>> mov esi, [rax+rdx*4]
>> mov r15d, 0FFFFFFFFh
>> mov r8d, 1
>> mov rcx, r10
>> db 66h, 66h, 2Eh
>> nop dword ptr [rax+rax+00000000h]
>>
>> loc_100000DC0: ; CODE XREF: _main+184 j
>> mov ebx, [rcx]
>> mov ebp, esi
>> sub ebp, ebx
>> xor edi, edi
>> cmp r8d, ebp
>> jz loc_100000D10
>> cmp esi, ebx
>> jz loc_100000D10
>> cmp r15d, ebp
>> jz loc_100000D10
>> add rcx, 4
>> inc r8
>> dec r15d
>> mov edi, r8d
>> add edi, edx
>> cmp edi, r14d
>> jl short loc_100000DC0
>> inc r9
>> add r10, 4
>> inc rdx
>> cmp r9, [rsp+68h+var_50]
>> jl short loc_100000DA3
>> nop word ptr [rax+rax+00000000h]
>>
>> loc_100000E10: ; CODE XREF: _main+119 j
>> ; _main+131 j
>> mov edi, 1
>> jmp loc_100000D10
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000E1A: ; CODE XREF: _main+6E j
>> test r14d, r14d
>> jle loc_100000F00
>> mov dword ptr [rax+r12*4], 1
>> xor ebp, ebp
>> cmp r14d, 2
>> jl loc_100000F09
>> mov rcx, rax
>> add rcx, 4
>> mov [rsp+68h+var_48], rcx
>> xor ebp, ebp
>> mov r15d, 1
>> nop dword ptr [rax+rax+00h]
>>
>> loc_100000E50: ; CODE XREF: _main+288 j
>> mov rbx, rbp
>> mov rcx, offset __mh_execute_header
>> cmp [rsp+68h+var_38], rcx
>> mov edx, 0
>> mov r13, [rsp+68h+var_48]
>> mov r8d, 1
>> mov r9d, 1
>> jle short loc_100000EE0
>>
>> loc_100000E7A: ; CODE XREF: _main+25A j
>> mov r12d, [rax+rdx*4]
>> mov edi, 0FFFFFFFFh
>> mov ecx, 1
>> mov rsi, r13
>> nop dword ptr [rax+rax+00h]
>>
>> loc_100000E90: ; CODE XREF: _main+249 j
>> mov r10d, [rsi]
>> mov ebp, r12d
>> sub ebp, r10d
>> xor r9d, r9d
>> cmp ecx, ebp
>> jz short loc_100000EE0
>> cmp r12d, r10d
>> jz short loc_100000EE0
>> cmp edi, ebp
>> jz short loc_100000EE0
>> add rsi, 4
>> inc rcx
>> dec edi
>> mov ebp, ecx
>> add ebp, edx
>> cmp ebp, r14d
>> jl short loc_100000E90
>> inc r8
>> add r13, 4
>> inc rdx
>> cmp r8, [rsp+68h+var_50]
>> jl short loc_100000E7A
>> mov r9d, 1
>> db 66h, 66h, 66h, 66h, 2Eh
>> nop word ptr [rax+rax+00000000h]
>>
>> loc_100000EE0: ; CODE XREF: _main+208 j
>> ; _main+22E j ...
>> mov rbp, rbx
>> add ebp, r9d
>> cmp r15d, r11d
>> lea ecx, [r15+1]
>> mov rdx, [rsp+68h+var_40]
>> mov [rax+rdx*4], ecx
>> mov r15d, ecx
>> jl loc_100000E50
>> jmp short loc_100000F09
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000F00: ; CODE XREF: _main+1AD j
>> xor ebp, ebp
>> test r11d, r11d
>> cmovns ebp, r11d
>>
>> loc_100000F09: ; CODE XREF: _main+E0 j
>> ; _main+1C1 j ...
>> mov rdi, rax ; void *
>> call _free
>> lea rdi, aDSolutions ; "%d solutions\n"
>> xor ebx, ebx
>> xor eax, eax
>> mov esi, ebp
>> call _printf
>>
>> loc_100000F23: ; CODE XREF: _main+16 j
>> mov eax, ebx
>> add rsp, 38h
>> pop rbx
>> pop r12
>> pop r13
>> pop r14
>> pop r15
>> pop rbp
>> retn
>> _main endp
>> ```
>>
>> gcc-4.9.2's result:
>> ```
>>
>> _main proc near
>>
>> var_48 = qword ptr -48h
>> var_40 = dword ptr -40h
>> var_3C = dword ptr -3Ch
>>
>> cmp edi, 2
>> jz short loc_100000D69
>> or eax, 0FFFFFFFFh
>> retn
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000D69: ; CODE XREF: _main+3 j
>> push r15
>> mov edx, 0Ah ; int
>> push r14
>> push r13
>> push r12
>> push rbp
>> push rbx
>> sub rsp, 18h
>> mov rdi, [rsi+8] ; char *
>> xor esi, esi ; char **
>> call _strtol
>> mov edi, 4 ; size_t
>> lea esi, [rax+1]
>> mov r14, rax
>> mov ebx, eax
>> lea r15d, [r14-2]
>> movsxd rsi, esi ; size_t
>> call _calloc
>> mov [rsp+48h+var_3C], 0
>> mov rdi, rax ; void *
>> lea eax, [r14-1]
>> cdqe
>> lea r13, [rdi+rax*4]
>> movsxd rax, r15d
>> mov ebp, [r13+0]
>> shl rax, 2
>> lea r12, [rdi+rax]
>> lea rax, [rdi+rax-4]
>> mov [rsp+48h+var_48], rax
>> mov eax, r14d
>> lea r14d, [r14+1]
>> nop word ptr [rax+rax+00h]
>> nop word ptr [rax+rax+00h]
>>
>> loc_100000DE0: ; CODE XREF: _main+12B j
>> ; _main+155 j ...
>> add ebp, 1
>> cmp ebx, ebp
>> mov [r13+0], ebp
>> jg short loc_100000E62
>> test r15d, r15d
>> js short loc_100000E33
>> mov ecx, [r12]
>> lea edx, [rcx+1]
>> cmp ebx, edx
>> mov [r12], edx
>> jg short loc_100000E58
>> mov r8, r12
>> mov rcx, [rsp+48h+var_48]
>> mov esi, r15d
>> jmp short loc_100000E24
>> ;
---------------------------------------------------------------------------
>> align 10h
>>
>> loc_100000E10: ; CODE XREF: _main+D1 j
>> mov edx, [rcx]
>> sub r8, 4
>> sub rcx, 4
>> add edx, 1
>> mov [rcx+4], edx
>> cmp ebx, edx
>> jg short loc_100000E58
>>
>> loc_100000E24: ; CODE XREF: _main+A9 j
>> sub esi, 1
>> mov dword ptr [r8], 0
>> cmp esi, 0FFFFFFFFh
>> jnz short loc_100000E10
>>
>> loc_100000E33: ; CODE XREF: _main+8E j
>> call _free
>> mov esi, [rsp+48h+var_3C]
>> add rsp, 18h
>> xor eax, eax
>> pop rbx
>> lea rdi, aDSolutions ; "%d solutions\n"
>> pop rbp
>> pop r12
>> pop r13
>> pop r14
>> pop r15
>> jmp _printf
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000E58: ; CODE XREF: _main+9D j
>> ; _main+C2 j
>> mov dword ptr [r13+0], 0
>> xor ebp, ebp
>>
>> loc_100000E62: ; CODE XREF: _main+89 j
>> test ebx, ebx
>> jle loc_100000EE6
>> lea r11, [rdi+8]
>> xor r10d, r10d
>>
>> loc_100000E71: ; CODE XREF: _main+184 j
>> add r10d, 1
>> cmp r10d, eax
>> jz short loc_100000EE6
>> mov r8d, [r11-8]
>> mov edx, r8d
>> sub edx, [r11-4]
>> add edx, 1
>> cmp edx, 2
>> jbe loc_100000DE0
>> mov r9d, r14d
>> mov rcx, r11
>> mov edx, 1
>> mov [rsp+48h+var_40], r10d
>> sub r9d, r10d
>> jmp short loc_100000ED3
>> ;
---------------------------------------------------------------------------
>> align 10h
>>
>> loc_100000EB0: ; CODE XREF: _main+179 j
>> mov esi, r8d
>> sub esi, [rcx]
>> jz loc_100000DE0
>> mov r10d, esi
>> add rcx, 4
>> add r10d, edx
>> jz loc_100000DE0
>> cmp esi, edx
>> jz loc_100000DE0
>>
>> loc_100000ED3: ; CODE XREF: _main+144 j
>> add edx, 1
>> cmp edx, r9d
>> jnz short loc_100000EB0
>> mov r10d, [rsp+48h+var_40]
>> add r11, 4
>> jmp short loc_100000E71
>> ;
---------------------------------------------------------------------------
>>
>> loc_100000EE6: ; CODE XREF: _main+104 j
>> ; _main+118 j
>> add [rsp+48h+var_3C], 1
>> jmp loc_100000DE0
>> _main endp
>> ```
>>
>> MSVC 10.0's result:
>>
>> ```
>>
>> _main proc near ; CODE XREF: ___tmainCRTStartup+106 p
>>
>> var_80 = dword ptr -80h
>> var_7C = dword ptr -7Ch
>> var_78 = dword ptr -78h
>> var_74 = dword ptr -74h
>> var_70 = dword ptr -70h
>> var_6C = dword ptr -6Ch
>> var_68 = dword ptr -68h
>> var_64 = dword ptr -64h
>> var_60 = dword ptr -60h
>> var_5C = dword ptr -5Ch
>> argc = dword ptr 8
>> argv = dword ptr 0Ch
>> envp = dword ptr 10h
>>
>> push ebp
>> mov ebp, esp
>> and esp, 0FFFFFF80h
>> push esi
>> push edi
>> push ebx
>> sub esp, 74h
>> push 3
>> call sub_4080F0
>> add esp, 4
>> stmxcsr [esp+80h+var_80]
>> or [esp+80h+var_80], 8000h
>> ldmxcsr [esp+80h+var_80]
>> cmp [ebp+argc], 2
>> jz short loc_40103A
>> mov eax, 0FFFFFFFFh
>> add esp, 74h
>> pop ebx
>> pop edi
>> pop esi
>> mov esp, ebp
>> pop ebp
>> retn
>> ;
---------------------------------------------------------------------------
>>
>> loc_40103A: ; CODE XREF: _main+29 j
>> call ds:GetTickCount
>> mov esi, eax
>> mov eax, [ebp+argv]
>> push dword ptr [eax+4] ; char *
>> call _atoi
>> mov edi, eax
>> lea eax, [edi+1]
>> push eax ; size_t
>> push 4 ; size_t
>> call _calloc
>> add esp, 0Ch
>> mov ecx, [eax+edi*4-4]
>> lea edx, [edi-1]
>> mov [esp+80h+var_6C], ecx
>> xor ebx, ebx
>> mov [esp+80h+var_7C], ebx
>> lea ecx, [eax+edi*4]
>> mov [esp+80h+var_74], ecx
>> lea ecx, [edi-2]
>> mov [esp+80h+var_70], ecx
>> mov [esp+80h+var_60], edx
>> mov [esp+80h+var_80], esi
>> mov ecx, [esp+80h+var_6C]
>>
>> loc_401087: ; CODE XREF: _main+142 j
>> ; _main+193 j
>> mov edx, [esp+80h+var_60]
>> inc ecx
>> mov [eax+edi*4-4], ecx
>> cmp edi, [eax+edx*4]
>> jg short loc_4010DC
>> mov esi, [esp+80h+var_70]
>> test esi, esi
>> js short loc_4010CE
>> xor edx, edx
>> mov [esp+80h+var_78], eax
>> xor ebx, ebx
>> mov eax, [esp+80h+var_74]
>>
>> loc_4010A9: ; CODE XREF: _main+C8 j
>> mov ecx, [eax+ebx*4-8]
>> inc ecx
>> cmp ecx, edi
>> jl loc_40117A
>> inc edx
>> lea esi, [ebx+edi-3]
>> mov dword ptr [eax+ebx*4-8], 0
>> dec ebx
>> cmp edx, [esp+80h+var_60]
>> jb short loc_4010A9
>> mov eax, [esp+80h+var_78]
>>
>> loc_4010CE: ; CODE XREF: _main+9B j
>> ; _main+186 j
>> test esi, esi
>> jl short loc_401147
>> mov dword ptr [eax+edi*4-4], 0
>> xor ecx, ecx
>>
>> loc_4010DC: ; CODE XREF: _main+93 j
>> test edi, edi
>> jle short loc_40113E
>> mov [esp+80h+var_6C], ecx
>> xor edx, edx
>> mov [esp+80h+var_5C], edi
>>
>> loc_4010EA: ; CODE XREF: _main+132 j
>> lea ecx, [edx+1]
>> mov ebx, ecx
>> mov esi, ebx
>> cmp ecx, [esp+80h+var_5C]
>> jge short loc_401130
>> mov edx, [eax+edx*4]
>> mov edi, 1
>> mov [esp+80h+var_64], esi
>> mov [esp+80h+var_68], ecx
>>
>> loc_401107: ; CODE XREF: _main+122 j
>> mov esi, [eax+ebx*4]
>> cmp edx, esi
>> jz short loc_40118B
>> sub esi, edx
>> mov ecx, esi
>> neg ecx
>> cmp edi, ecx
>> jz short loc_40118B
>> cmp esi, edi
>> jz short loc_40118B
>> inc ebx
>> inc edi
>> cmp ebx, [esp+80h+var_5C]
>> jl short loc_401107
>> mov ecx, [esp+80h+var_68]
>> mov esi, [esp+80h+var_64]
>> cmp ecx, [esp+80h+var_5C]
>>
>> loc_401130: ; CODE XREF: _main+F5 j
>> mov edx, esi
>> jl short loc_4010EA
>> xchg ax, ax
>> mov ecx, [esp+80h+var_6C]
>> mov edi, [esp+80h+var_5C]
>>
>> loc_40113E: ; CODE XREF: _main+DE j
>> inc [esp+80h+var_7C]
>> jmp loc_401087
>> ;
---------------------------------------------------------------------------
>>
>> loc_401147: ; CODE XREF: _main+D0 j
>> mov ebx, [esp+80h+var_7C]
>> mov esi, [esp+80h+var_80]
>> push eax ; void *
>> call _free
>> add esp, 4
>> call ds:GetTickCount
>> sub eax, esi
>> push eax
>> push ebx
>> push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n"
>> call _printf
>> xor eax, eax
>> add esp, 80h
>> pop ebx
>> pop edi
>> pop esi
>> mov esp, ebp
>> pop ebp
>> retn
>> ;
---------------------------------------------------------------------------
>>
>> loc_40117A: ; CODE XREF: _main+B0 j
>> mov edx, [esp+80h+var_74]
>> mov eax, [esp+80h+var_78]
>> mov [edx+ebx*4-8], ecx
>> jmp loc_4010CE
>> ;
---------------------------------------------------------------------------
>>
>> loc_40118B: ; CODE XREF: _main+10C j
>> ; _main+116 j ...
>> mov ecx, [esp+80h+var_6C]
>> mov edi, [esp+80h+var_5C]
>> jmp loc_401087
>> _main endp
>> ```
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Seemingly Similar Threads
- [LLVMdev] trunk's optimizer generates slower code than 3.5
- [LLVMdev] trunk's optimizer generates slower code than 3.5
- [LLVMdev] bug in X86 disasm code?
- Tail call optimization is getting affected due to local function related optimization with IPRA
- Tail call optimization is getting affected due to local function related optimization with IPRA