thr3ads.net - llvm dev - [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32) [Nov 2018]

If this information is useful, please help other people find it:
Share via:

Stefan Kanthak via llvm-dev

2018-Nov-06 15:33 UTC

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Hi @ll,

while clang/LLVM recognizes common bit-twiddling idioms/expressions
like

unsigned int rotate(unsigned int x, unsigned int n)
{
    return (x << n) | (x >> (32 - n));
}

and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.

The standard IEEE CRC-32 for "big endian" alias "network"
byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):

unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = 0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++ << 24;
        for (i = 8; i > 0; i--)
            if (crc & 0x80000000L)             // the code generated
                crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
            else                               // these 4 lines is
                crc <<= 1;                     // rather poor!
    }
    return crc;
}

The same function for "little endian" byte order, using the
"inverse"
or "mirrored" polynom:

unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = ~0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++;
        for (i = 8; i > 0; i--)
            if (crc & 1L)                      // the code generated
                crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
            else                               // these 4 lines is
                crc >>= 1;                     // rather poor!
    }
    return ~crc;
}

See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm> (-O2)

crc32be: # @crc32be
        xor    eax, eax
        test   esi, esi
        jne    .LBB0_2
        jmp    .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
        add    rdi, 1
        test   esi, esi
        je    .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  edx, byte ptr [rdi]
        shl    edx, 24
        xor    edx, eax
        mov    ecx, -8
        mov    eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not
clobbered!
        lea    r8d, [rax + rax]        |     add   eax, eax
        mov    edx, r8d                | # CF is set from the MSB of EAX
        xor    edx, -306674912         |     sbb   edx, edx
        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, edx                |     and   edx, -306674912
        cmovns eax, r8d                |     xor   eax, edx
        add    ecx, 1
        jne    .LBB0_3
        jmp    .LBB0_4
.LBB0_5:
        ret
crc32le: # @crc32le
        test   esi, esi
        je    .LBB1_1
        mov    eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  ecx, byte ptr [rdi]
        xor    eax, ecx
        mov    r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
        mov    edx, eax                | #  neither r8 nor rcx clobbered!
        shr    edx                     |     shr   eax, 1
        mov    ecx, edx                | # CF is set from the LSB of EAX
        xor    ecx, 79764919           |     sbb   edx, edx
        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, ecx                |     and   edx, 79764919
        cmove  eax, edx                |     xor   eax, edx
        add    r8d, 1
        jne    .LBB1_5
        add    rdi, 1
        test   esi, esi
        jne    .LBB1_4
        not    eax
        ret
.LBB1_1:
        xor    eax, eax
        ret

JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
      code sequence with 6 and 7 instructions; this accounts for a total
      of 16 and 24 superfluous instructions respectively.

Sanjay Patel via llvm-dev

2018-Nov-08 19:08 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
this:
unsigned int foo(unsigned int crc) {
    if (crc & 0x80000000)
      crc <<= 1, crc ^= 0xEDB88320;
    else
      crc <<= 1;
    return crc;
}

Which is this in LLVM IR:
define i32 @foo(i32 %x) {
  %t2 = icmp slt i32 %x, 0
  %t3 = shl i32 %x, 1
  %t4 = xor i32 %t3, -306674912
  %t5 = select i1 %t2, i32 %t4, i32 %t3
  ret i32 %t5
}

Please a file a bug report for the x86 backend (including performance
numbers if you have that data).

On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Hi @ll,
>
> while clang/LLVM recognizes common bit-twiddling idioms/expressions
> like
>
> unsigned int rotate(unsigned int x, unsigned int n)
> {
>     return (x << n) | (x >> (32 - n));
> }
>
> and typically generates "rotate" machine instructions for this
> expression, it fails to recognize other also common bit-twiddling
> idioms/expressions.
>
> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>
> unsigned int crc32be(unsigned char const *octets, unsigned int count)
> {
>     unsigned int crc = 0L;
>     unsigned int i;
>
>     while (count--) {
>         crc ^= *octets++ << 24;
>         for (i = 8; i > 0; i--)
>             if (crc & 0x80000000L)             // the code generated
>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>             else                               // these 4 lines is
>                 crc <<= 1;                     // rather poor!
>     }
>     return crc;
> }
>
> The same function for "little endian" byte order, using the
"inverse"
> or "mirrored" polynom:
>
> unsigned int crc32le(unsigned char const *octets, unsigned int count)
> {
>     unsigned int crc = ~0L;
>     unsigned int i;
>
>     while (count--) {
>         crc ^= *octets++;
>         for (i = 8; i > 0; i--)
>             if (crc & 1L)                      // the code generated
>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>             else                               // these 4 lines is
>                 crc >>= 1;                     // rather poor!
>     }
>     return ~crc;
> }
>
> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
> (-O2)
>
> crc32be: # @crc32be
>         xor    eax, eax
>         test   esi, esi
>         jne    .LBB0_2
>         jmp    .LBB0_5
> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>         add    rdi, 1
>         test   esi, esi
>         je    .LBB0_5
> .LBB0_2: # =>This Loop Header: Depth=1
>         add    esi, -1
>         movzx  edx, byte ptr [rdi]
>         shl    edx, 24
>         xor    edx, eax
>         mov    ecx, -8
>         mov    eax, edx
> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
> not clobbered!
>         lea    r8d, [rax + rax]        |     add   eax, eax
>         mov    edx, r8d                | # CF is set from the MSB of EAX
>         xor    edx, -306674912         |     sbb   edx, edx
>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
> else 0
>         mov    eax, edx                |     and   edx, -306674912
>         cmovns eax, r8d                |     xor   eax, edx
>         add    ecx, 1
>         jne    .LBB0_3
>         jmp    .LBB0_4
> .LBB0_5:
>         ret
> crc32le: # @crc32le
>         test   esi, esi
>         je    .LBB1_1
>         mov    eax, -1
> .LBB1_4: # =>This Loop Header: Depth=1
>         add    esi, -1
>         movzx  ecx, byte ptr [rdi]
>         xor    eax, ecx
>         mov    r8d, -8
> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>         shr    edx                     |     shr   eax, 1
>         mov    ecx, edx                | # CF is set from the LSB of EAX
>         xor    ecx, 79764919           |     sbb   edx, edx
>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
> else 0
>         mov    eax, ecx                |     and   edx, 79764919
>         cmove  eax, edx                |     xor   eax, edx
>         add    r8d, 1
>         jne    .LBB1_5
>         add    rdi, 1
>         test   esi, esi
>         jne    .LBB1_4
>         not    eax
>         ret
> .LBB1_1:
>         xor    eax, eax
>         ret
>
> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>       code sequence with 6 and 7 instructions; this accounts for a total
>       of 16 and 24 superfluous instructions respectively.
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20181108/5f1c5c30/attachment.html>

Stefan Kanthak via llvm-dev

2018-Nov-08 21:49 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

"Sanjay Patel" <spatel at rotateright.com> wrote:
> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }
Generalize this a little bit: the optimizer "knows" that
(crc & 0x80000000) is equivalent to testing the sign-bit,
which sets SF on x86.

On x86, both (crc <<= 1) and (crc += crc) shift the sign-bit
into CF, so there is no need for an explicit "test crc, crc"
in the above case: testing SF before the shift is equivalent
to testing CF after the shift.
I expect that the optimizer "knows" about this equivalence.
If it doesn't take "sbb masking" into account, the above code
might also be translated to

    lea    edx, [eax+eax]
    xor    edx, 0EDB88320h
    add    eax, eax
    cmovc  eax, edx

The same holds for the opposite case: for (crc & 1) followed
by (crc >>= 1) there is no need for an explicit "test crc, 1"
since the right shift "moves" the LSB into CF.

regards
Stefan
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
> 
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
> 
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> 
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for
this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code
generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the
"inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code
generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6,
r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of
EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7,
and
>>         mov    edx, eax                | #  neither r8 nor rcx
clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of
EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same
non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a
total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>

Stefan Kanthak via llvm-dev

2018-Nov-27 23:03 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

"Sanjay Patel" <spatel at rotateright.com> wrote:
> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }
> 
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
> 
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
JFTR: as soon as the ternary operator is moved into a function,
      LLVM/clang performs an EQUIVALENT optimisation for the left-
      shifting CRC/LFSR, for both for the x86 and x86-64: see
      <https://godbolt.org/z/J1KY2d>

      The right-shifting CRC/LFSR is but still NOT optimal!

--- test.c ---

unsigned long long lfsr64right(unsigned long long argument, unsigned long long
polynomial)
{
    return argument & 1 ? polynomial ^ (argument >> 1) : argument
>> 1;
}

...

unsigned long long lfsr64left(unsigned long long argument, unsigned long long
polynomial)
{
    return (long long) argument < 0 ? polynomial ^ (argument << 1) :
argument << 1;
}

...

--- EOF ---

lfsr64right: # @lfsr64right
;;; remove these 3 instructions
;;; mov    eax, edi
;;; and    eax, 1
;;; neg    rax
    shr    rdi
;;; add the next instruction
    sbb    rax, rax
    and    rax, rsi
    xor    rax, rdi
    ret

...

lfsr64left: # @lfsr64left
    lea    rax, [rdi + rdi]
    sar    rdi, 63
    and    rdi, rsi
    xor    rax, rdi
    ret

These 5 instructions are perfect!
If now LLVM/clang would use this sequence without moving the
ternary operator into its own function (which is inlined anyway)...

regards
Stefan Kanthak
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> 
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for
this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code
generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the
"inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code
generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6,
r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of
EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7,
and
>>         mov    edx, eax                | #  neither r8 nor rcx
clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of
EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same
non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a
total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>

Stefan Kanthak via llvm-dev

2018-Nov-27 23:37 UTC

head link

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

"Sanjay Patel" <spatel at rotateright.com> wrote:
> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }
To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
    lea eax, [rdi + rdi]
    sar edi, 31
    and edi, esi
    xor eax, edi
    ret

The right-shifting case leaves but still room for improvement!

bar: # @bar                | bar: # @bar
    mov eax, edi           |
    and eax, 1             |
    neg eax                |
    shr edi                |     shr edi
                           |     sbb eax, eax
    and eax, esi           |     and eax, esi
    xor eax, edi           |     xor eax, edi
    ret                    |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
> 
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
> 
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> 
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for
this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias
"network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for
example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code
generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the
"inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code
generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86
from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6,
r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of
EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7,
and
>>         mov    edx, eax                | #  neither r8 nor rcx
clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of
EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same
non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a
total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>

Possibly Parallel Threads

Search for more apparently analagous threads

llvm dev - Nov 2018 - Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Possibly Parallel Threads