Stefan Kanthak via llvm-dev
2018-Nov-06 15:33 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
Hi @ll, while clang/LLVM recognizes common bit-twiddling idioms/expressions like unsigned int rotate(unsigned int x, unsigned int n) { return (x << n) | (x >> (32 - n)); } and typically generates "rotate" machine instructions for this expression, it fails to recognize other also common bit-twiddling idioms/expressions. The standard IEEE CRC-32 for "big endian" alias "network" byte order (see <https://tools.ietf.org/html/rfc1952#section-8> for example): unsigned int crc32be(unsigned char const *octets, unsigned int count) { unsigned int crc = 0L; unsigned int i; while (count--) { crc ^= *octets++ << 24; for (i = 8; i > 0; i--) if (crc & 0x80000000L) // the code generated crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from else // these 4 lines is crc <<= 1; // rather poor! } return crc; } The same function for "little endian" byte order, using the "inverse" or "mirrored" polynom: unsigned int crc32le(unsigned char const *octets, unsigned int count) { unsigned int crc = ~0L; unsigned int i; while (count--) { crc ^= *octets++; for (i = 8; i > 0; i--) if (crc & 1L) // the code generated crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from else // these 4 lines is crc >>= 1; // rather poor! } return ~crc; } See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> (-O2) crc32be: # @crc32be xor eax, eax test esi, esi jne .LBB0_2 jmp .LBB0_5 .LBB0_4: # in Loop: Header=BB0_2 Depth=1 add rdi, 1 test esi, esi je .LBB0_5 .LBB0_2: # =>This Loop Header: Depth=1 add esi, -1 movzx edx, byte ptr [rdi] shl edx, 24 xor edx, eax mov ecx, -8 mov eax, edx .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 not clobbered! lea r8d, [rax + rax] | add eax, eax mov edx, r8d | # CF is set from the MSB of EAX xor edx, -306674912 | sbb edx, edx test eax, eax | # EDX is 0xFFFFFFFF if CF set, else 0 mov eax, edx | and edx, -306674912 cmovns eax, r8d | xor eax, edx add ecx, 1 jne .LBB0_3 jmp .LBB0_4 .LBB0_5: ret crc32le: # @crc32le test esi, esi je .LBB1_1 mov eax, -1 .LBB1_4: # =>This Loop Header: Depth=1 add esi, -1 movzx ecx, byte ptr [rdi] xor eax, ecx mov r8d, -8 .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and mov edx, eax | # neither r8 nor rcx clobbered! shr edx | shr eax, 1 mov ecx, edx | # CF is set from the LSB of EAX xor ecx, 79764919 | sbb edx, edx test al, 1 | # EDX is 0xFFFFFFFF if CF set, else 0 mov eax, ecx | and edx, 79764919 cmove eax, edx | xor eax, edx add r8d, 1 jne .LBB1_5 add rdi, 1 test esi, esi jne .LBB1_4 not eax ret .LBB1_1: xor eax, eax ret JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal code sequence with 6 and 7 instructions; this accounts for a total of 16 and 24 superfluous instructions respectively.
Sanjay Patel via llvm-dev
2018-Nov-08 19:08 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like this: unsigned int foo(unsigned int crc) { if (crc & 0x80000000) crc <<= 1, crc ^= 0xEDB88320; else crc <<= 1; return crc; } Which is this in LLVM IR: define i32 @foo(i32 %x) { %t2 = icmp slt i32 %x, 0 %t3 = shl i32 %x, 1 %t4 = xor i32 %t3, -306674912 %t5 = select i1 %t2, i32 %t4, i32 %t3 ret i32 %t5 } Please a file a bug report for the x86 backend (including performance numbers if you have that data). On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < llvm-dev at lists.llvm.org> wrote:> Hi @ll, > > while clang/LLVM recognizes common bit-twiddling idioms/expressions > like > > unsigned int rotate(unsigned int x, unsigned int n) > { > return (x << n) | (x >> (32 - n)); > } > > and typically generates "rotate" machine instructions for this > expression, it fails to recognize other also common bit-twiddling > idioms/expressions. > > The standard IEEE CRC-32 for "big endian" alias "network" byte order > (see <https://tools.ietf.org/html/rfc1952#section-8> for example): > > unsigned int crc32be(unsigned char const *octets, unsigned int count) > { > unsigned int crc = 0L; > unsigned int i; > > while (count--) { > crc ^= *octets++ << 24; > for (i = 8; i > 0; i--) > if (crc & 0x80000000L) // the code generated > crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from > else // these 4 lines is > crc <<= 1; // rather poor! > } > return crc; > } > > The same function for "little endian" byte order, using the "inverse" > or "mirrored" polynom: > > unsigned int crc32le(unsigned char const *octets, unsigned int count) > { > unsigned int crc = ~0L; > unsigned int i; > > while (count--) { > crc ^= *octets++; > for (i = 8; i > 0; i--) > if (crc & 1L) // the code generated > crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from > else // these 4 lines is > crc >>= 1; // rather poor! > } > return ~crc; > } > > See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> > (-O2) > > crc32be: # @crc32be > xor eax, eax > test esi, esi > jne .LBB0_2 > jmp .LBB0_5 > .LBB0_4: # in Loop: Header=BB0_2 Depth=1 > add rdi, 1 > test esi, esi > je .LBB0_5 > .LBB0_2: # =>This Loop Header: Depth=1 > add esi, -1 > movzx edx, byte ptr [rdi] > shl edx, 24 > xor edx, eax > mov ecx, -8 > mov eax, edx > .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 > not clobbered! > lea r8d, [rax + rax] | add eax, eax > mov edx, r8d | # CF is set from the MSB of EAX > xor edx, -306674912 | sbb edx, edx > test eax, eax | # EDX is 0xFFFFFFFF if CF set, > else 0 > mov eax, edx | and edx, -306674912 > cmovns eax, r8d | xor eax, edx > add ecx, 1 > jne .LBB0_3 > jmp .LBB0_4 > .LBB0_5: > ret > crc32le: # @crc32le > test esi, esi > je .LBB1_1 > mov eax, -1 > .LBB1_4: # =>This Loop Header: Depth=1 > add esi, -1 > movzx ecx, byte ptr [rdi] > xor eax, ecx > mov r8d, -8 > .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and > mov edx, eax | # neither r8 nor rcx clobbered! > shr edx | shr eax, 1 > mov ecx, edx | # CF is set from the LSB of EAX > xor ecx, 79764919 | sbb edx, edx > test al, 1 | # EDX is 0xFFFFFFFF if CF set, > else 0 > mov eax, ecx | and edx, 79764919 > cmove eax, edx | xor eax, edx > add r8d, 1 > jne .LBB1_5 > add rdi, 1 > test esi, esi > jne .LBB1_4 > not eax > ret > .LBB1_1: > xor eax, eax > ret > > JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal > code sequence with 6 and 7 instructions; this accounts for a total > of 16 and 24 superfluous instructions respectively. > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20181108/5f1c5c30/attachment.html>
Stefan Kanthak via llvm-dev
2018-Nov-08 21:49 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > }Generalize this a little bit: the optimizer "knows" that (crc & 0x80000000) is equivalent to testing the sign-bit, which sets SF on x86. On x86, both (crc <<= 1) and (crc += crc) shift the sign-bit into CF, so there is no need for an explicit "test crc, crc" in the above case: testing SF before the shift is equivalent to testing CF after the shift. I expect that the optimizer "knows" about this equivalence. If it doesn't take "sbb masking" into account, the above code might also be translated to lea edx, [eax+eax] xor edx, 0EDB88320h add eax, eax cmovc eax, edx The same holds for the opposite case: for (crc & 1) followed by (crc >>= 1) there is no need for an explicit "test crc, 1" since the right shift "moves" the LSB into CF. regards Stefan> Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data). > > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. >> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! >> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >>
Stefan Kanthak via llvm-dev
2018-Nov-27 23:03 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > } > > Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data).JFTR: as soon as the ternary operator is moved into a function, LLVM/clang performs an EQUIVALENT optimisation for the left- shifting CRC/LFSR, for both for the x86 and x86-64: see <https://godbolt.org/z/J1KY2d> The right-shifting CRC/LFSR is but still NOT optimal! --- test.c --- unsigned long long lfsr64right(unsigned long long argument, unsigned long long polynomial) { return argument & 1 ? polynomial ^ (argument >> 1) : argument >> 1; } ... unsigned long long lfsr64left(unsigned long long argument, unsigned long long polynomial) { return (long long) argument < 0 ? polynomial ^ (argument << 1) : argument << 1; } ... --- EOF --- lfsr64right: # @lfsr64right ;;; remove these 3 instructions ;;; mov eax, edi ;;; and eax, 1 ;;; neg rax shr rdi ;;; add the next instruction sbb rax, rax and rax, rsi xor rax, rdi ret ... lfsr64left: # @lfsr64left lea rax, [rdi + rdi] sar rdi, 63 and rdi, rsi xor rax, rdi ret These 5 instructions are perfect! If now LLVM/clang would use this sequence without moving the ternary operator into its own function (which is inlined anyway)... regards Stefan Kanthak> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. >> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! >> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >
Stefan Kanthak via llvm-dev
2018-Nov-27 23:37 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > }To document this for x86 too: rewrite the function slightly unsigned int foo(unsigned int crc, unsigned int poly) { if (crc & 0x80000000) crc <<= 1, crc ^= poly; else crc <<= 1; return crc; } unsigned int bar(unsigned int crc, unsigned int poly) { if (crc & 1) crc >>= 1, crc ^= poly; else crc >>= 1; return crc; } and you get the perfect code for the left-shifting case! foo: # @foo lea eax, [rdi + rdi] sar edi, 31 and edi, esi xor eax, edi ret The right-shifting case leaves but still room for improvement! bar: # @bar | bar: # @bar mov eax, edi | and eax, 1 | neg eax | shr edi | shr edi | sbb eax, eax and eax, esi | and eax, esi xor eax, edi | xor eax, edi ret | ret See <https://godbolt.org/z/aPKweG> regards Stefan Kanthak> Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data). > > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. >> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! >> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >