thr3ads.net - llvm dev - [llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more... [Dec 2018]

If this information is useful, please help other people find it:
Share via:

Stefan Kanthak via llvm-dev

2018-Dec-01 17:29 UTC

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

Compile the following functions with "-O3 -target i386-win32"
(see <https://godbolt.org/z/exmjWY>):

__int64 __fastcall div(__int64 foo, __int64 bar)
{
    return foo / bar;
}


On the left the generated code; on the right the expected,
properly optimised code:

    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    call  __alldiv                    |     jmp  __alldiv
    ret   16                          |


__int64 __fastcall mod(__int64 foo, __int64 bar)
{
    return foo % bar;
}

    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    push  dword ptr [esp + 16]        |
    call  __allrem                    |     jmp  __allrem
    ret   16                          |


__int64 __fastcall mul(__int64 foo, __int64 bar)
{
    return foo * bar;
}

    push  esi                         |     mov   ecx, dword ptr [esp + 16]
    mov   ecx, dword ptr [esp + 16]   |     mov   edx, dword ptr [esp + 12]
    mov   esi, dword ptr [esp + 8]    |     imul  edx, dword ptr [esp + 8]
    mov   eax, ecx                    |     mov   eax, dword ptr [esp + 4]
    imul  ecx, dword ptr [esp + 12]   |     imul  ecx, eax
    mul   esi                         |     add   ecx, edx
    imul  esi, dword ptr [esp + 20]   |     mul   dword ptr [esp + 12]
    add   edx, ecx                    |     add   edx, ecx
    add   edx, esi                    |     ret   16
    pop   esi                         |
    ret   16                          |

Craig Topper via llvm-dev

2018-Dec-01 19:15 UTC

head link

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

For the multiply case, your improved code requires duplicating a load. Sure
it's safe in this case because there are no stores and the memory isn't
volatile.  But the register allocator would have to analyze the code to
prove that it's safe to duplicate.

~Craig


On Sat, Dec 1, 2018 at 9:38 AM Stefan Kanthak via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Compile the following functions with "-O3 -target i386-win32"
> (see <https://godbolt.org/z/exmjWY>):
>
> __int64 __fastcall div(__int64 foo, __int64 bar)
> {
>     return foo / bar;
> }
>
>
> On the left the generated code; on the right the expected,
> properly optimised code:
>
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     call  __alldiv                    |     jmp  __alldiv
>     ret   16                          |
>
>
> __int64 __fastcall mod(__int64 foo, __int64 bar)
> {
>     return foo % bar;
> }
>
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     push  dword ptr [esp + 16]        |
>     call  __allrem                    |     jmp  __allrem
>     ret   16                          |
>
>
> __int64 __fastcall mul(__int64 foo, __int64 bar)
> {
>     return foo * bar;
> }
>
>     push  esi                         |     mov   ecx, dword ptr [esp + 16]
>     mov   ecx, dword ptr [esp + 16]   |     mov   edx, dword ptr [esp + 12]
>     mov   esi, dword ptr [esp + 8]    |     imul  edx, dword ptr [esp + 8]
>     mov   eax, ecx                    |     mov   eax, dword ptr [esp + 4]
>     imul  ecx, dword ptr [esp + 12]   |     imul  ecx, eax
>     mul   esi                         |     add   ecx, edx
>     imul  esi, dword ptr [esp + 20]   |     mul   dword ptr [esp + 12]
>     add   edx, ecx                    |     add   edx, ecx
>     add   edx, esi                    |     ret   16
>     pop   esi                         |
>     ret   16                          |
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20181201/b2aa08a7/attachment.html>

Stefan Kanthak via llvm-dev

2018-Dec-01 20:15 UTC

head link

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

"Craig Topper" <craig.topper at gmail.com> wrote:
> For the multiply case, your improved code requires duplicating a load.
... while saving another load and a store, and it doesn't clobber ESI.
> Sure it's safe in this case because there are no stores and the memory
> isn't volatile.
Right. Especially the latter can safely be assumed true for function
arguments on the stack.
> But the register allocator would have to analyze the code to
> prove that it's safe to duplicate.
To me, this sentence implies that it might be a design goal never to
load a function argument or variable twice.
If so, that's OK; if not, the register allocator should prove that
duplicating the load is safe.

regards
Stefan
> On Sat, Dec 1, 2018 at 9:38 AM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> 
>> Compile the following functions with "-O3 -target i386-win32"
>> (see <https://godbolt.org/z/exmjWY>):
>>
>> __int64 __fastcall div(__int64 foo, __int64 bar)
>> {
>>     return foo / bar;
>> }
>>
>>
>> On the left the generated code; on the right the expected,
>> properly optimised code:
>>
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     call  __alldiv                    |     jmp  __alldiv
>>     ret   16                          |
>>
>>
>> __int64 __fastcall mod(__int64 foo, __int64 bar)
>> {
>>     return foo % bar;
>> }
>>
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     push  dword ptr [esp + 16]        |
>>     call  __allrem                    |     jmp  __allrem
>>     ret   16                          |
>>
>>
>> __int64 __fastcall mul(__int64 foo, __int64 bar)
>> {
>>     return foo * bar;
>> }
>>
>>     push  esi                         |     mov   ecx, dword ptr [esp + 16]
>>     mov   ecx, dword ptr [esp + 16]   |     mov   edx, dword ptr [esp + 12]
>>     mov   esi, dword ptr [esp + 8]    |     imul  edx, dword ptr [esp + 8]
>>     mov   eax, ecx                    |     mov   eax, dword ptr [esp + 4]
>>     imul  ecx, dword ptr [esp + 12]   |     imul  ecx, eax
>>     mul   esi                         |     add   ecx, edx
>>     imul  esi, dword ptr [esp + 20]   |     mul   dword ptr [esp + 12]
>>     add   edx, ecx                    |     add   edx, ecx
>>     add   edx, esi                    |     ret   16
>>     pop   esi                         |
>>     ret   16                          |
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>

Possibly Parallel Threads

Search for more reasonably related threads

llvm dev - Dec 2018 - Where's the optimiser gone? (part 5.c): missed tail calls, and more...

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

[llvm-dev] Where's the optimiser gone? (part 5.c): missed tail calls, and more...

Possibly Parallel Threads