thr3ads.net - llvm dev - [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32) [Nov 2018]

If this information is useful, please help other people find it:
Share via:

Stefan Kanthak via llvm-dev

2018-Nov-27 23:37 UTC

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

"Sanjay Patel" <spatel at rotateright.com> wrote:
> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }
To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
    lea eax, [rdi + rdi]
    sar edi, 31
    and edi, esi
    xor eax, edi
    ret

The right-shifting case leaves but still room for improvement!

bar: # @bar                | bar: # @bar
    mov eax, edi           |
    and eax, 1             |
    neg eax                |
    shr edi                |     shr edi
                           |     sbb eax, eax
    and eax, esi           |     and eax, esi
    xor eax, edi           |     xor eax, edi
    ret                    |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
> 
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
> 
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> 
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for
this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code
generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the
"inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code
generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6,
r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of
EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7,
and
>>         mov    edx, eax                | #  neither r8 nor rcx
clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of
EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same
non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a
total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>

Sanjay Patel via llvm-dev

2018-Nov-28 15:10 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Thanks for reporting this and other perf opportunities. As I mentioned
before, if you could file bug reports for these, that's probably the only
way they're ever going to get fixed (unless you're planning to fix them
yourself). It's not an ideal situation, but that's how this project
works
in my experience.

Someone filed 1 of your examples on your behalf here:
https://bugs.llvm.org/show_bug.cgi?id=39793

Please do consider following that model for your other problems. Without
that kind of report, your code examples are likely to be forgotten.

Finally, sending mail to llvm-bugs is practically useless. I don't know of
anyone that is tracking mails to that list rather than actual bug reports.

On Tue, Nov 27, 2018 at 4:37 PM Stefan Kanthak <stefan.kanthak at
nexgo.de>
wrote:
> "Sanjay Patel" <spatel at rotateright.com> wrote:
>
> > IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases
like
> > this:
> > unsigned int foo(unsigned int crc) {
> >    if (crc & 0x80000000)
> >      crc <<= 1, crc ^= 0xEDB88320;
> >    else
> >      crc <<= 1;
> >    return crc;
> > }
>
> To document this for x86 too: rewrite the function slightly
>
> unsigned int foo(unsigned int crc, unsigned int poly) {
>     if (crc & 0x80000000)
>         crc <<= 1, crc ^= poly;
>     else
>         crc <<= 1;
>     return crc;
> }
>
> unsigned int bar(unsigned int crc, unsigned int poly) {
>     if (crc & 1)
>         crc >>= 1, crc ^= poly;
>     else
>         crc >>= 1;
>     return crc;
> }
>
> and you get the perfect code for the left-shifting case!
>
> foo: # @foo
>     lea eax, [rdi + rdi]
>     sar edi, 31
>     and edi, esi
>     xor eax, edi
>     ret
>
> The right-shifting case leaves but still room for improvement!
>
> bar: # @bar                | bar: # @bar
>     mov eax, edi           |
>     and eax, 1             |
>     neg eax                |
>     shr edi                |     shr edi
>                            |     sbb eax, eax
>     and eax, esi           |     and eax, esi
>     xor eax, edi           |     xor eax, edi
>     ret                    |     ret
>
> See <https://godbolt.org/z/aPKweG>
>
> regards
> Stefan Kanthak
>
> > Which is this in LLVM IR:
> > define i32 @foo(i32 %x) {
> >  %t2 = icmp slt i32 %x, 0
> >  %t3 = shl i32 %x, 1
> >  %t4 = xor i32 %t3, -306674912
> >  %t5 = select i1 %t2, i32 %t4, i32 %t3
> >  ret i32 %t5
> > }
> >
> > Please a file a bug report for the x86 backend (including performance
> > numbers if you have that data).
> >
> > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> > llvm-dev at lists.llvm.org> wrote:
> >
> >> Hi @ll,
> >>
> >> while clang/LLVM recognizes common bit-twiddling
idioms/expressions
> >> like
> >>
> >> unsigned int rotate(unsigned int x, unsigned int n)
> >> {
> >>     return (x << n) | (x >> (32 - n));
> >> }
> >>
> >> and typically generates "rotate" machine instructions
for this
> >> expression, it fails to recognize other also common bit-twiddling
> >> idioms/expressions.
> >>
> >> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
> >> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
> >>
> >> unsigned int crc32be(unsigned char const *octets, unsigned int
count)
> >> {
> >>     unsigned int crc = 0L;
> >>     unsigned int i;
> >>
> >>     while (count--) {
> >>         crc ^= *octets++ << 24;
> >>         for (i = 8; i > 0; i--)
> >>             if (crc & 0x80000000L)             // the code
generated
> >>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel
x86 from
> >>             else                               // these 4 lines is
> >>                 crc <<= 1;                     // rather
poor!
> >>     }
> >>     return crc;
> >> }
> >>
> >> The same function for "little endian" byte order, using
the "inverse"
> >> or "mirrored" polynom:
> >>
> >> unsigned int crc32le(unsigned char const *octets, unsigned int
count)
> >> {
> >>     unsigned int crc = ~0L;
> >>     unsigned int i;
> >>
> >>     while (count--) {
> >>         crc ^= *octets++;
> >>         for (i = 8; i > 0; i--)
> >>             if (crc & 1L)                      // the code
generated
> >>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel
x86 from
> >>             else                               // these 4 lines is
> >>                 crc >>= 1;                     // rather
poor!
> >>     }
> >>     return ~crc;
> >> }
> >>
> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <
> https://godbolt.org/z/zeExHm>
> >> (-O2)
> >>
> >> crc32be: # @crc32be
> >>         xor    eax, eax
> >>         test   esi, esi
> >>         jne    .LBB0_2
> >>         jmp    .LBB0_5
> >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
> >>         add    rdi, 1
> >>         test   esi, esi
> >>         je    .LBB0_5
> >> .LBB0_2: # =>This Loop Header: Depth=1
> >>         add    esi, -1
> >>         movzx  edx, byte ptr [rdi]
> >>         shl    edx, 24
> >>         xor    edx, eax
> >>         mov    ecx, -8
> >>         mov    eax, edx
> >> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead
of 6,
> r8
> >> not clobbered!
> >>         lea    r8d, [rax + rax]        |     add   eax, eax
> >>         mov    edx, r8d                | # CF is set from the MSB
of EAX
> >>         xor    edx, -306674912         |     sbb   edx, edx
> >>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF
set,
> >> else 0
> >>         mov    eax, edx                |     and   edx, -306674912
> >>         cmovns eax, r8d                |     xor   eax, edx
> >>         add    ecx, 1
> >>         jne    .LBB0_3
> >>         jmp    .LBB0_4
> >> .LBB0_5:
> >>         ret
> >> crc32le: # @crc32le
> >>         test   esi, esi
> >>         je    .LBB1_1
> >>         mov    eax, -1
> >> .LBB1_4: # =>This Loop Header: Depth=1
> >>         add    esi, -1
> >>         movzx  ecx, byte ptr [rdi]
> >>         xor    eax, ecx
> >>         mov    r8d, -8
> >> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead
of 7,
> and
> >>         mov    edx, eax                | #  neither r8 nor rcx
> clobbered!
> >>         shr    edx                     |     shr   eax, 1
> >>         mov    ecx, edx                | # CF is set from the LSB
of EAX
> >>         xor    ecx, 79764919           |     sbb   edx, edx
> >>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF
set,
> >> else 0
> >>         mov    eax, ecx                |     and   edx, 79764919
> >>         cmove  eax, edx                |     xor   eax, edx
> >>         add    r8d, 1
> >>         jne    .LBB1_5
> >>         add    rdi, 1
> >>         test   esi, esi
> >>         jne    .LBB1_4
> >>         not    eax
> >>         ret
> >> .LBB1_1:
> >>         xor    eax, eax
> >>         ret
> >>
> >> JFTR: with -O2, the inner loop gets unrolled, using the same
non-optimal
> >>       code sequence with 6 and 7 instructions; this accounts for a
total
> >>       of 16 and 24 superfluous instructions respectively.
> >> _______________________________________________
> >> LLVM Developers mailing list
> >> llvm-dev at lists.llvm.org
> >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
> >>
> >
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20181128/ad3bdfe3/attachment.html>

David Blaikie via llvm-dev

2018-Nov-28 15:17 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

On Wed, Nov 28, 2018 at 7:11 AM Sanjay Patel via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Thanks for reporting this and other perf opportunities. As I mentioned
> before, if you could file bug reports for these, that's probably the
only
> way they're ever going to get fixed (unless you're planning to fix
them
> yourself). It's not an ideal situation, but that's how this project
works
> in my experience.
>
> Someone filed 1 of your examples on your behalf here:
> https://bugs.llvm.org/show_bug.cgi?id=39793
>
> Please do consider following that model for your other problems. Without
> that kind of report, your code examples are likely to be forgotten.
>
> Finally, sending mail to llvm-bugs is practically useless. I don't know
of
> anyone that is tracking mails to that list rather than actual bug reports.
>
The list is setup to not accept email, basically - I'm one of the
moderators there & any email sent there that doesn't come from the bug
database will get automatically or manually rejected with the
suggestion/request that the bug database is used instead.

>
> On Tue, Nov 27, 2018 at 4:37 PM Stefan Kanthak <stefan.kanthak at
nexgo.de>
> wrote:
>
>> "Sanjay Patel" <spatel at rotateright.com> wrote:
>>
>> > IIUC, you want to use x86-specific bit-hacks (sbb masking) in
cases like
>> > this:
>> > unsigned int foo(unsigned int crc) {
>> >    if (crc & 0x80000000)
>> >      crc <<= 1, crc ^= 0xEDB88320;
>> >    else
>> >      crc <<= 1;
>> >    return crc;
>> > }
>>
>> To document this for x86 too: rewrite the function slightly
>>
>> unsigned int foo(unsigned int crc, unsigned int poly) {
>>     if (crc & 0x80000000)
>>         crc <<= 1, crc ^= poly;
>>     else
>>         crc <<= 1;
>>     return crc;
>> }
>>
>> unsigned int bar(unsigned int crc, unsigned int poly) {
>>     if (crc & 1)
>>         crc >>= 1, crc ^= poly;
>>     else
>>         crc >>= 1;
>>     return crc;
>> }
>>
>> and you get the perfect code for the left-shifting case!
>>
>> foo: # @foo
>>     lea eax, [rdi + rdi]
>>     sar edi, 31
>>     and edi, esi
>>     xor eax, edi
>>     ret
>>
>> The right-shifting case leaves but still room for improvement!
>>
>> bar: # @bar                | bar: # @bar
>>     mov eax, edi           |
>>     and eax, 1             |
>>     neg eax                |
>>     shr edi                |     shr edi
>>                            |     sbb eax, eax
>>     and eax, esi           |     and eax, esi
>>     xor eax, edi           |     xor eax, edi
>>     ret                    |     ret
>>
>> See <https://godbolt.org/z/aPKweG>
>>
>> regards
>> Stefan Kanthak
>>
>> > Which is this in LLVM IR:
>> > define i32 @foo(i32 %x) {
>> >  %t2 = icmp slt i32 %x, 0
>> >  %t3 = shl i32 %x, 1
>> >  %t4 = xor i32 %t3, -306674912
>> >  %t5 = select i1 %t2, i32 %t4, i32 %t3
>> >  ret i32 %t5
>> > }
>> >
>> > Please a file a bug report for the x86 backend (including
performance
>> > numbers if you have that data).
>> >
>> > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
>> > llvm-dev at lists.llvm.org> wrote:
>> >
>> >> Hi @ll,
>> >>
>> >> while clang/LLVM recognizes common bit-twiddling
idioms/expressions
>> >> like
>> >>
>> >> unsigned int rotate(unsigned int x, unsigned int n)
>> >> {
>> >>     return (x << n) | (x >> (32 - n));
>> >> }
>> >>
>> >> and typically generates "rotate" machine
instructions for this
>> >> expression, it fails to recognize other also common
bit-twiddling
>> >> idioms/expressions.
>> >>
>> >> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
>> >> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
>> >>
>> >> unsigned int crc32be(unsigned char const *octets, unsigned int
count)
>> >> {
>> >>     unsigned int crc = 0L;
>> >>     unsigned int i;
>> >>
>> >>     while (count--) {
>> >>         crc ^= *octets++ << 24;
>> >>         for (i = 8; i > 0; i--)
>> >>             if (crc & 0x80000000L)             // the code
generated
>> >>                 crc <<= 1, crc ^= 0xEDB88320L; // for
Intel x86 from
>> >>             else                               // these 4
lines is
>> >>                 crc <<= 1;                     // rather
poor!
>> >>     }
>> >>     return crc;
>> >> }
>> >>
>> >> The same function for "little endian" byte order,
using the "inverse"
>> >> or "mirrored" polynom:
>> >>
>> >> unsigned int crc32le(unsigned char const *octets, unsigned int
count)
>> >> {
>> >>     unsigned int crc = ~0L;
>> >>     unsigned int i;
>> >>
>> >>     while (count--) {
>> >>         crc ^= *octets++;
>> >>         for (i = 8; i > 0; i--)
>> >>             if (crc & 1L)                      // the code
generated
>> >>                 crc >>= 1, crc ^= 0x04C11DB7L; // for
Intel x86 from
>> >>             else                               // these 4
lines is
>> >>                 crc >>= 1;                     // rather
poor!
>> >>     }
>> >>     return ~crc;
>> >> }
>> >>
>> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <
>> https://godbolt.org/z/zeExHm>
>> >> (-O2)
>> >>
>> >> crc32be: # @crc32be
>> >>         xor    eax, eax
>> >>         test   esi, esi
>> >>         jne    .LBB0_2
>> >>         jmp    .LBB0_5
>> >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>> >>         add    rdi, 1
>> >>         test   esi, esi
>> >>         je    .LBB0_5
>> >> .LBB0_2: # =>This Loop Header: Depth=1
>> >>         add    esi, -1
>> >>         movzx  edx, byte ptr [rdi]
>> >>         shl    edx, 24
>> >>         xor    edx, eax
>> >>         mov    ecx, -8
>> >>         mov    eax, edx
>> >> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions
instead of
>> 6, r8
>> >> not clobbered!
>> >>         lea    r8d, [rax + rax]        |     add   eax, eax
>> >>         mov    edx, r8d                | # CF is set from the
MSB of
>> EAX
>> >>         xor    edx, -306674912         |     sbb   edx, edx
>> >>         test   eax, eax                | # EDX is 0xFFFFFFFF
if CF set,
>> >> else 0
>> >>         mov    eax, edx                |     and   edx,
-306674912
>> >>         cmovns eax, r8d                |     xor   eax, edx
>> >>         add    ecx, 1
>> >>         jne    .LBB0_3
>> >>         jmp    .LBB0_4
>> >> .LBB0_5:
>> >>         ret
>> >> crc32le: # @crc32le
>> >>         test   esi, esi
>> >>         je    .LBB1_1
>> >>         mov    eax, -1
>> >> .LBB1_4: # =>This Loop Header: Depth=1
>> >>         add    esi, -1
>> >>         movzx  ecx, byte ptr [rdi]
>> >>         xor    eax, ecx
>> >>         mov    r8d, -8
>> >> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions
instead of
>> 7, and
>> >>         mov    edx, eax                | #  neither r8 nor rcx
>> clobbered!
>> >>         shr    edx                     |     shr   eax, 1
>> >>         mov    ecx, edx                | # CF is set from the
LSB of
>> EAX
>> >>         xor    ecx, 79764919           |     sbb   edx, edx
>> >>         test   al, 1                   | # EDX is 0xFFFFFFFF
if CF set,
>> >> else 0
>> >>         mov    eax, ecx                |     and   edx,
79764919
>> >>         cmove  eax, edx                |     xor   eax, edx
>> >>         add    r8d, 1
>> >>         jne    .LBB1_5
>> >>         add    rdi, 1
>> >>         test   esi, esi
>> >>         jne    .LBB1_4
>> >>         not    eax
>> >>         ret
>> >> .LBB1_1:
>> >>         xor    eax, eax
>> >>         ret
>> >>
>> >> JFTR: with -O2, the inner loop gets unrolled, using the same
>> non-optimal
>> >>       code sequence with 6 and 7 instructions; this accounts
for a
>> total
>> >>       of 16 and 24 superfluous instructions respectively.
>> >> _______________________________________________
>> >> LLVM Developers mailing list
>> >> llvm-dev at lists.llvm.org
>> >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>> >>
>> >
>>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20181128/30e5f318/attachment-0001.html>

llvm dev - Nov 2018 - Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)