Stefan Kanthak via llvm-dev
2018-Nov-06 15:33 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
Hi @ll,
while clang/LLVM recognizes common bit-twiddling idioms/expressions
like
unsigned int rotate(unsigned int x, unsigned int n)
{
return (x << n) | (x >> (32 - n));
}
and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.
The standard IEEE CRC-32 for "big endian" alias "network"
byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):
unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
unsigned int crc = 0L;
unsigned int i;
while (count--) {
crc ^= *octets++ << 24;
for (i = 8; i > 0; i--)
if (crc & 0x80000000L) // the code generated
crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
else // these 4 lines is
crc <<= 1; // rather poor!
}
return crc;
}
The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynomial:
unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
unsigned int crc = ~0L;
unsigned int i;
while (count--) {
crc ^= *octets++;
for (i = 8; i > 0; i--)
if (crc & 1L) // the code generated
crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
else // these 4 lines is
crc >>= 1; // rather poor!
}
return ~crc;
}
See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm> (-O2)
crc32be: # @crc32be
xor eax, eax
test esi, esi
jne .LBB0_2
jmp .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
add rdi, 1
test esi, esi
je .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
add esi, -1
movzx edx, byte ptr [rdi]
shl edx, 24
xor edx, eax
mov ecx, -8
mov eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 not clobbered!
lea r8d, [rax + rax] | add eax, eax
mov edx, r8d | # CF is set from the MSB of EAX
xor edx, -306674912 | sbb edx, edx
test eax, eax | # EDX is 0xFFFFFFFF if CF set, else 0
mov eax, edx | and edx, -306674912
cmovns eax, r8d | xor eax, edx
add ecx, 1
jne .LBB0_3
jmp .LBB0_4
.LBB0_5:
ret
crc32le: # @crc32le
test esi, esi
je .LBB1_1
mov eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
add esi, -1
movzx ecx, byte ptr [rdi]
xor eax, ecx
mov r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and
mov edx, eax | # neither r8 nor rcx clobbered!
shr edx | shr eax, 1
mov ecx, edx | # CF is set from the LSB of EAX
xor ecx, 79764919 | sbb edx, edx
test al, 1 | # EDX is 0xFFFFFFFF if CF set, else 0
mov eax, ecx | and edx, 79764919
cmove eax, edx | xor eax, edx
add r8d, 1
jne .LBB1_5
add rdi, 1
test esi, esi
jne .LBB1_4
not eax
ret
.LBB1_1:
xor eax, eax
ret
JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
code sequence with 6 and 7 instructions; this accounts for a total
of 16 and 24 superfluous instructions respectively.
Sanjay Patel via llvm-dev
2018-Nov-08 19:08 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
this:
unsigned int foo(unsigned int crc) {
if (crc & 0x80000000)
crc <<= 1, crc ^= 0xEDB88320;
else
crc <<= 1;
return crc;
}
Which is this in LLVM IR:
define i32 @foo(i32 %x) {
%t2 = icmp slt i32 %x, 0
%t3 = shl i32 %x, 1
%t4 = xor i32 %t3, -306674912
%t5 = select i1 %t2, i32 %t4, i32 %t3
ret i32 %t5
}
Please file a bug report for the x86 backend (including performance
numbers if you have that data).
On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Hi @ll,
>
> while clang/LLVM recognizes common bit-twiddling idioms/expressions
> like
>
> unsigned int rotate(unsigned int x, unsigned int n)
> {
> return (x << n) | (x >> (32 - n));
> }
>
> and typically generates "rotate" machine instructions for this
> expression, it fails to recognize other also common bit-twiddling
> idioms/expressions.
>
> The standard IEEE CRC-32 for "big endian" alias "network" byte order
> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>
> unsigned int crc32be(unsigned char const *octets, unsigned int count)
> {
> unsigned int crc = 0L;
> unsigned int i;
>
> while (count--) {
> crc ^= *octets++ << 24;
> for (i = 8; i > 0; i--)
> if (crc & 0x80000000L) // the code generated
> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
> else // these 4 lines is
> crc <<= 1; // rather poor!
> }
> return crc;
> }
>
> The same function for "little endian" byte order, using the "inverse"
> or "mirrored" polynom:
>
> unsigned int crc32le(unsigned char const *octets, unsigned int count)
> {
> unsigned int crc = ~0L;
> unsigned int i;
>
> while (count--) {
> crc ^= *octets++;
> for (i = 8; i > 0; i--)
> if (crc & 1L) // the code generated
> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
> else // these 4 lines is
> crc >>= 1; // rather poor!
> }
> return ~crc;
> }
>
> See <https://godbolt.org/z/eYJeWt> (-O1) and
<https://godbolt.org/z/zeExHm>
> (-O2)
>
> crc32be: # @crc32be
> xor eax, eax
> test esi, esi
> jne .LBB0_2
> jmp .LBB0_5
> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
> add rdi, 1
> test esi, esi
> je .LBB0_5
> .LBB0_2: # =>This Loop Header: Depth=1
> add esi, -1
> movzx edx, byte ptr [rdi]
> shl edx, 24
> xor edx, eax
> mov ecx, -8
> mov eax, edx
> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8
> not clobbered!
> lea r8d, [rax + rax] | add eax, eax
> mov edx, r8d | # CF is set from the MSB of EAX
> xor edx, -306674912 | sbb edx, edx
> test eax, eax | # EDX is 0xFFFFFFFF if CF set,
> else 0
> mov eax, edx | and edx, -306674912
> cmovns eax, r8d | xor eax, edx
> add ecx, 1
> jne .LBB0_3
> jmp .LBB0_4
> .LBB0_5:
> ret
> crc32le: # @crc32le
> test esi, esi
> je .LBB1_1
> mov eax, -1
> .LBB1_4: # =>This Loop Header: Depth=1
> add esi, -1
> movzx ecx, byte ptr [rdi]
> xor eax, ecx
> mov r8d, -8
> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and
> mov edx, eax | # neither r8 nor rcx clobbered!
> shr edx | shr eax, 1
> mov ecx, edx | # CF is set from the LSB of EAX
> xor ecx, 79764919 | sbb edx, edx
> test al, 1 | # EDX is 0xFFFFFFFF if CF set,
> else 0
> mov eax, ecx | and edx, 79764919
> cmove eax, edx | xor eax, edx
> add r8d, 1
> jne .LBB1_5
> add rdi, 1
> test esi, esi
> jne .LBB1_4
> not eax
> ret
> .LBB1_1:
> xor eax, eax
> ret
>
> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
> code sequence with 6 and 7 instructions; this accounts for a total
> of 16 and 24 superfluous instructions respectively.
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20181108/5f1c5c30/attachment.html>
Stefan Kanthak via llvm-dev
2018-Nov-08 21:49 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > }Generalize this a little bit: the optimizer "knows" that (crc & 0x80000000) is equivalent to testing the sign-bit, which sets SF on x86. On x86, both (crc <<= 1) and (crc += crc) shift the sign-bit into CF, so there is no need for an explicit "test crc, crc" in the above case: testing SF before the shift is equivalent to testing CF after the shift. I expect that the optimizer "knows" about this equivalence. If it doesn't take "sbb masking" into account, the above code might also be translated to lea edx, [eax+eax] xor edx, 0EDB88320h add eax, eax cmovc eax, edx The same holds for the opposite case: for (crc & 1) followed by (crc >>= 1) there is no need for an explicit "test crc, 1" since the right shift "moves" the LSB into CF. regards Stefan> Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data). > > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. 
>> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! 
>> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >>
Stefan Kanthak via llvm-dev
2018-Nov-27 23:03 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > } > > Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data).JFTR: as soon as the ternary operator is moved into a function, LLVM/clang performs an EQUIVALENT optimisation for the left- shifting CRC/LFSR, for both for the x86 and x86-64: see <https://godbolt.org/z/J1KY2d> The right-shifting CRC/LFSR is but still NOT optimal! --- test.c --- unsigned long long lfsr64right(unsigned long long argument, unsigned long long polynomial) { return argument & 1 ? polynomial ^ (argument >> 1) : argument >> 1; } ... unsigned long long lfsr64left(unsigned long long argument, unsigned long long polynomial) { return (long long) argument < 0 ? polynomial ^ (argument << 1) : argument << 1; } ... --- EOF --- lfsr64right: # @lfsr64right ;;; remove these 3 instructions ;;; mov eax, edi ;;; and eax, 1 ;;; neg rax shr rdi ;;; add the next instruction sbb rax, rax and rax, rsi xor rax, rdi ret ... lfsr64left: # @lfsr64left lea rax, [rdi + rdi] sar rdi, 63 and rdi, rsi xor rax, rdi ret These 5 instructions are perfect! If now LLVM/clang would use this sequence without moving the ternary operator into its own function (which is inlined anyway)... 
regards Stefan Kanthak> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. >> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! 
>> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! >> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >
Stefan Kanthak via llvm-dev
2018-Nov-27 23:37 UTC
[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
"Sanjay Patel" <spatel at rotateright.com> wrote:> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like > this: > unsigned int foo(unsigned int crc) { > if (crc & 0x80000000) > crc <<= 1, crc ^= 0xEDB88320; > else > crc <<= 1; > return crc; > }To document this for x86 too: rewrite the function slightly unsigned int foo(unsigned int crc, unsigned int poly) { if (crc & 0x80000000) crc <<= 1, crc ^= poly; else crc <<= 1; return crc; } unsigned int bar(unsigned int crc, unsigned int poly) { if (crc & 1) crc >>= 1, crc ^= poly; else crc >>= 1; return crc; } and you get the perfect code for the left-shifting case! foo: # @foo lea eax, [rdi + rdi] sar edi, 31 and edi, esi xor eax, edi ret The right-shifting case leaves but still room for improvement! bar: # @bar | bar: # @bar mov eax, edi | and eax, 1 | neg eax | shr edi | shr edi | sbb eax, eax and eax, esi | and eax, esi xor eax, edi | xor eax, edi ret | ret See <https://godbolt.org/z/aPKweG> regards Stefan Kanthak> Which is this in LLVM IR: > define i32 @foo(i32 %x) { > %t2 = icmp slt i32 %x, 0 > %t3 = shl i32 %x, 1 > %t4 = xor i32 %t3, -306674912 > %t5 = select i1 %t2, i32 %t4, i32 %t3 > ret i32 %t5 > } > > Please a file a bug report for the x86 backend (including performance > numbers if you have that data). > > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> Hi @ll, >> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions >> like >> >> unsigned int rotate(unsigned int x, unsigned int n) >> { >> return (x << n) | (x >> (32 - n)); >> } >> >> and typically generates "rotate" machine instructions for this >> expression, it fails to recognize other also common bit-twiddling >> idioms/expressions. 
>> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example): >> >> unsigned int crc32be(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = 0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++ << 24; >> for (i = 8; i > 0; i--) >> if (crc & 0x80000000L) // the code generated >> crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from >> else // these 4 lines is >> crc <<= 1; // rather poor! >> } >> return crc; >> } >> >> The same function for "little endian" byte order, using the "inverse" >> or "mirrored" polynom: >> >> unsigned int crc32le(unsigned char const *octets, unsigned int count) >> { >> unsigned int crc = ~0L; >> unsigned int i; >> >> while (count--) { >> crc ^= *octets++; >> for (i = 8; i > 0; i--) >> if (crc & 1L) // the code generated >> crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from >> else // these 4 lines is >> crc >>= 1; // rather poor! >> } >> return ~crc; >> } >> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> >> (-O2) >> >> crc32be: # @crc32be >> xor eax, eax >> test esi, esi >> jne .LBB0_2 >> jmp .LBB0_5 >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1 >> add rdi, 1 >> test esi, esi >> je .LBB0_5 >> .LBB0_2: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx edx, byte ptr [rdi] >> shl edx, 24 >> xor edx, eax >> mov ecx, -8 >> mov eax, edx >> .LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 >> not clobbered! 
>> lea r8d, [rax + rax] | add eax, eax >> mov edx, r8d | # CF is set from the MSB of EAX >> xor edx, -306674912 | sbb edx, edx >> test eax, eax | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, edx | and edx, -306674912 >> cmovns eax, r8d | xor eax, edx >> add ecx, 1 >> jne .LBB0_3 >> jmp .LBB0_4 >> .LBB0_5: >> ret >> crc32le: # @crc32le >> test esi, esi >> je .LBB1_1 >> mov eax, -1 >> .LBB1_4: # =>This Loop Header: Depth=1 >> add esi, -1 >> movzx ecx, byte ptr [rdi] >> xor eax, ecx >> mov r8d, -8 >> .LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and >> mov edx, eax | # neither r8 nor rcx clobbered! >> shr edx | shr eax, 1 >> mov ecx, edx | # CF is set from the LSB of EAX >> xor ecx, 79764919 | sbb edx, edx >> test al, 1 | # EDX is 0xFFFFFFFF if CF set, >> else 0 >> mov eax, ecx | and edx, 79764919 >> cmove eax, edx | xor eax, edx >> add r8d, 1 >> jne .LBB1_5 >> add rdi, 1 >> test esi, esi >> jne .LBB1_4 >> not eax >> ret >> .LBB1_1: >> xor eax, eax >> ret >> >> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal >> code sequence with 6 and 7 instructions; this accounts for a total >> of 16 and 24 superfluous instructions respectively. >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >
Seemingly Similar Threads
- Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
- Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
- [RFC] New pass: LoopExitValues
- [RFC] New pass: LoopExitValues
- Where's the optimiser gone (part 11): use the proper instruction for sign extension