Stefan Kanthak via llvm-dev
2018-Nov-06  15:33 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
Hi @ll,
while clang/LLVM recognizes common bit-twiddling idioms/expressions
like
/*
 * Rotate x left by n bit positions.
 *
 * Both shift counts are reduced modulo 32, so every n is well defined.
 * The textbook form `x >> (32 - n)` shifts by the full width for n == 0
 * (and by an out-of-range count for n > 32), which is undefined behaviour.
 * `-n` is computed in unsigned arithmetic, so `-n & 31` equals
 * (32 - n) mod 32.
 *
 * Like the original expression, this assumes unsigned int is 32 bits wide.
 */
unsigned int rotate(unsigned int x, unsigned int n)
{
    return (x << (n & 31)) | (x >> (-n & 31));
}
and typically generates "rotate" machine instructions for this
expression, it fails to recognize other, equally common bit-twiddling
idioms/expressions.
The standard IEEE CRC-32 for "big endian" alias "network" byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):
/*
 * Bitwise (table-less) CRC-32 over the `count` bytes starting at `octets`,
 * consuming each byte most-significant bit first ("big endian" order).
 *
 * The register starts at 0 and is returned as-is (no final inversion).
 *
 * Because the register is shifted LEFT, the generator polynomial must be
 * given in its normal representation, 0x04C11DB7. 0xEDB88320 is the
 * bit-reflected form of the same polynomial and only fits the
 * right-shifting variant.
 *
 * Assumes unsigned int is 32 bits wide.
 */
unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = 0U;
    unsigned int i;
    while (count--) {
        /* Convert before shifting: the octet is promoted to (signed) int,
           and shifting a value >= 0x80 left by 24 would overflow it. */
        crc ^= (unsigned int) *octets++ << 24;
        for (i = 8; i > 0; i--)
            if (crc & 0x80000000U)             // the code generated
                crc <<= 1, crc ^= 0x04C11DB7U; // for Intel x86 from
            else                               // these 4 lines is
                crc <<= 1;                     // rather poor!
    }
    return crc;
}
The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynomial:
/*
 * Bitwise (table-less) CRC-32 over the `count` bytes starting at `octets`,
 * consuming each byte least-significant bit first ("little endian" order).
 *
 * The register starts at all ones and the result is inverted before it is
 * returned, so an empty input yields 0.
 *
 * Because the register is shifted RIGHT, the generator polynomial must be
 * given in its bit-reflected ("mirrored") representation, 0xEDB88320.
 * 0x04C11DB7 is the normal form and only fits the left-shifting variant.
 * With the reflected constant this is the CRC-32 of RFC 1952
 * (crc32le("123456789", 9) == 0xCBF43926).
 *
 * Assumes unsigned int is 32 bits wide.
 */
unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = ~0U;
    unsigned int i;
    while (count--) {
        crc ^= *octets++;
        for (i = 8; i > 0; i--)
            if (crc & 1U)                      // the code generated
                crc >>= 1, crc ^= 0xEDB88320U; // for Intel x86 from
            else                               // these 4 lines is
                crc >>= 1;                     // rather poor!
    }
    return ~crc;
}
See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm> (-O2)
crc32be: # @crc32be
        xor    eax, eax
        test   esi, esi
        jne    .LBB0_2
        jmp    .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
        add    rdi, 1
        test   esi, esi
        je    .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  edx, byte ptr [rdi]
        shl    edx, 24
        xor    edx, eax
        mov    ecx, -8
        mov    eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not clobbered!
        lea    r8d, [rax + rax]        |     add   eax, eax
        mov    edx, r8d                | # CF is set from the MSB of EAX
        xor    edx, -306674912         |     sbb   edx, edx
        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, edx                |     and   edx, -306674912
        cmovns eax, r8d                |     xor   eax, edx
        add    ecx, 1
        jne    .LBB0_3
        jmp    .LBB0_4
.LBB0_5:
        ret
crc32le: # @crc32le
        test   esi, esi
        je    .LBB1_1
        mov    eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  ecx, byte ptr [rdi]
        xor    eax, ecx
        mov    r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
        mov    edx, eax                | #  neither r8 nor rcx clobbered!
        shr    edx                     |     shr   eax, 1
        mov    ecx, edx                | # CF is set from the LSB of EAX
        xor    ecx, 79764919           |     sbb   edx, edx
        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, ecx                |     and   edx, 79764919
        cmove  eax, edx                |     xor   eax, edx
        add    r8d, 1
        jne    .LBB1_5
        add    rdi, 1
        test   esi, esi
        jne    .LBB1_4
        not    eax
        ret
.LBB1_1:
        xor    eax, eax
        ret
JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
      code sequence with 6 and 7 instructions; this accounts for a total
      of 16 and 24 superfluous instructions respectively.
Sanjay Patel via llvm-dev
2018-Nov-08  19:08 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
this:
/*
 * One step of a left-shifting CRC/LFSR: shift crc left by one bit and, when
 * the bit shifted out (bit 31) was set, XOR the constant 0xEDB88320 into the
 * result.
 */
unsigned int foo(unsigned int crc) {
    /* Capture the top bit before the shift discards it. */
    const unsigned int top_bit = crc & 0x80000000;
    const unsigned int shifted = crc << 1;

    return top_bit ? shifted ^ 0xEDB88320 : shifted;
}
Which is this in LLVM IR:
; Shift %x left by one bit; if the sign bit of %x was set, additionally XOR
; the shifted value with the constant -306674912 (0xEDB88320 as unsigned).
define i32 @foo(i32 %x) {
  ; %t2: true when %x is negative as a signed i32, i.e. its top bit is set.
  %t2 = icmp slt i32 %x, 0
  ; %t3: the shifted value, used directly when the top bit was clear.
  %t3 = shl i32 %x, 1
  ; %t4: the shifted value with the constant XORed in.
  %t4 = xor i32 %t3, -306674912
  ; Pick the XORed value only when the top bit was set.
  %t5 = select i1 %t2, i32 %t4, i32 %t3
  ret i32 %t5
}
Please file a bug report for the x86 backend (including performance
numbers if you have that data).
On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Hi @ll,
>
> while clang/LLVM recognizes common bit-twiddling idioms/expressions
> like
>
> unsigned int rotate(unsigned int x, unsigned int n)
> {
>     return (x << n) | (x >> (32 - n));
> }
>
> and typically generates "rotate" machine instructions for this
> expression, it fails to recognize other also common bit-twiddling
> idioms/expressions.
>
> The standard IEEE CRC-32 for "big endian" alias "network" byte order
> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>
> unsigned int crc32be(unsigned char const *octets, unsigned int count)
> {
>     unsigned int crc = 0L;
>     unsigned int i;
>
>     while (count--) {
>         crc ^= *octets++ << 24;
>         for (i = 8; i > 0; i--)
>             if (crc & 0x80000000L)             // the code generated
>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>             else                               // these 4 lines is
>                 crc <<= 1;                     // rather poor!
>     }
>     return crc;
> }
>
> The same function for "little endian" byte order, using the "inverse"
> or "mirrored" polynom:
>
> unsigned int crc32le(unsigned char const *octets, unsigned int count)
> {
>     unsigned int crc = ~0L;
>     unsigned int i;
>
>     while (count--) {
>         crc ^= *octets++;
>         for (i = 8; i > 0; i--)
>             if (crc & 1L)                      // the code generated
>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>             else                               // these 4 lines is
>                 crc >>= 1;                     // rather poor!
>     }
>     return ~crc;
> }
>
> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
> (-O2)
>
> crc32be: # @crc32be
>         xor    eax, eax
>         test   esi, esi
>         jne    .LBB0_2
>         jmp    .LBB0_5
> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>         add    rdi, 1
>         test   esi, esi
>         je    .LBB0_5
> .LBB0_2: # =>This Loop Header: Depth=1
>         add    esi, -1
>         movzx  edx, byte ptr [rdi]
>         shl    edx, 24
>         xor    edx, eax
>         mov    ecx, -8
>         mov    eax, edx
> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
> not clobbered!
>         lea    r8d, [rax + rax]        |     add   eax, eax
>         mov    edx, r8d                | # CF is set from the MSB of EAX
>         xor    edx, -306674912         |     sbb   edx, edx
>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
> else 0
>         mov    eax, edx                |     and   edx, -306674912
>         cmovns eax, r8d                |     xor   eax, edx
>         add    ecx, 1
>         jne    .LBB0_3
>         jmp    .LBB0_4
> .LBB0_5:
>         ret
> crc32le: # @crc32le
>         test   esi, esi
>         je    .LBB1_1
>         mov    eax, -1
> .LBB1_4: # =>This Loop Header: Depth=1
>         add    esi, -1
>         movzx  ecx, byte ptr [rdi]
>         xor    eax, ecx
>         mov    r8d, -8
> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>         shr    edx                     |     shr   eax, 1
>         mov    ecx, edx                | # CF is set from the LSB of EAX
>         xor    ecx, 79764919           |     sbb   edx, edx
>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
> else 0
>         mov    eax, ecx                |     and   edx, 79764919
>         cmove  eax, edx                |     xor   eax, edx
>         add    r8d, 1
>         jne    .LBB1_5
>         add    rdi, 1
>         test   esi, esi
>         jne    .LBB1_4
>         not    eax
>         ret
> .LBB1_1:
>         xor    eax, eax
>         ret
>
> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>       code sequence with 6 and 7 instructions; this accounts for a total
>       of 16 and 24 superfluous instructions respectively.
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20181108/5f1c5c30/attachment.html>
Stefan Kanthak via llvm-dev
2018-Nov-08  21:49 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>     if (crc & 0x80000000)
>       crc <<= 1, crc ^= 0xEDB88320;
>     else
>       crc <<= 1;
>     return crc;
> }

Generalize this a little bit: the optimizer "knows" that
(crc & 0x80000000) is equivalent to testing the sign-bit, which sets
SF on x86.
On x86, both (crc <<= 1) and (crc += crc) shift the sign-bit into CF,
so there is no need for an explicit "test crc, crc" in the above case:
testing SF before the shift is equivalent to testing CF after the shift.

I expect that the optimizer "knows" about this equivalence.
If it doesn't take "sbb masking" into account, the above code might
also be translated to

        lea    edx, [eax+eax]
        xor    edx, 0EDB88320h
        add    eax, eax
        cmovc  eax, edx

The same holds for the opposite case: for (crc & 1) followed by
(crc >>= 1) there is no need for an explicit "test crc, 1" since the
right shift "moves" the LSB into CF.

regards
Stefan

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>   %t2 = icmp slt i32 %x, 0
>   %t3 = shl i32 %x, 1
>   %t4 = xor i32 %t3, -306674912
>   %t5 = select i1 %t2, i32 %t4, i32 %t3
>   ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! 
>> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >>
Stefan Kanthak via llvm-dev
2018-Nov-27  23:03 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>     if (crc & 0x80000000)
>       crc <<= 1, crc ^= 0xEDB88320;
>     else
>       crc <<= 1;
>     return crc;
> }
>
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>   %t2 = icmp slt i32 %x, 0
>   %t3 = shl i32 %x, 1
>   %t4 = xor i32 %t3, -306674912
>   %t5 = select i1 %t2, i32 %t4, i32 %t3
>   ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).

JFTR: as soon as the ternary operator is moved into a function,
      LLVM/clang performs an EQUIVALENT optimisation for the
      left-shifting CRC/LFSR, for both x86 and x86-64:
      see <https://godbolt.org/z/J1KY2d>
      The right-shifting CRC/LFSR is, however, still NOT optimal!

--- test.c ---
unsigned long long lfsr64right(unsigned long long argument, unsigned long long polynomial)
{
    return argument & 1 ? polynomial ^ (argument >> 1) : argument >> 1;
}
...
unsigned long long lfsr64left(unsigned long long argument, unsigned long long polynomial)
{
    return (long long) argument < 0 ? polynomial ^ (argument << 1) : argument << 1;
}
...
--- EOF ---

lfsr64right: # @lfsr64right
;;; remove these 3 instructions
;;;     mov     eax, edi
;;;     and     eax, 1
;;;     neg     rax
        shr     rdi
;;; add the next instruction
        sbb     rax, rax
        and     rax, rsi
        xor     rax, rdi
        ret
...
lfsr64left: # @lfsr64left
        lea     rax, [rdi + rdi]
        sar     rdi, 63
        and     rdi, rsi
        xor     rax, rdi
        ret

These 5 instructions are perfect!
If only LLVM/clang would now use this sequence without moving the
ternary operator into its own function (which is inlined anyway)...
regards Stefan Kanthak> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. >> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! 
>> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! >> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >
Stefan Kanthak via llvm-dev
2018-Nov-27  23:37 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>     if (crc & 0x80000000)
>       crc <<= 1, crc ^= 0xEDB88320;
>     else
>       crc <<= 1;
>     return crc;
> }

To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
        lea    eax, [rdi + rdi]
        sar    edi, 31
        and    edi, esi
        xor    eax, edi
        ret

The right-shifting case, however, still leaves room for improvement!

bar: # @bar                 | bar: # @bar
        mov    eax, edi     |
        and    eax, 1       |
        neg    eax          |
        shr    edi          |     shr   edi
                            |     sbb   eax, eax
        and    eax, esi     |     and   eax, esi
        xor    eax, edi     |     xor   eax, edi
        ret                 |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>   %t2 = icmp slt i32 %x, 0
>   %t3 = shl i32 %x, 1
>   %t4 = xor i32 %t3, -306674912
>   %t5 = select i1 %t2, i32 %t4, i32 %t3
>   ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! 
>> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >
Seemingly Similar Threads
- Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
- Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
- [RFC] New pass: LoopExitValues
- [RFC] New pass: LoopExitValues
- Where's the optimiser gone (part 11): use the proper instruction for sign extension