Hello,
When debugging an issue, I noticed that the use of ffs() from bitopts.h
is leading to particularly poor code. (See below for the example, I
suspect there is something causing particularly bad behaviour for an
optimisation step)
The bitop functions are deliberately designed to be compatible with
their libc and compiler builtin variants, and replacing it leads to 0x60
bytes disappearing from the .text section even with the small handful of
uses in the x86 tree. This is because the compiler can optimise far
more around its builtin than our rigid inline assembly.
Would it be acceptable to provide a patch which does a straight replace
of the static inline function with a define? __builtin_ffs is supported
in all compilers we support, but there does not appear to any sane way
to detect the presence of the builtin.
~Andrew
Example (from my upcoming changes to the HPET code):
Code:
static uint32_t free_channels = (1U << num_hpets_used) - 1; # pseudocode,
for brevity
static struct hpet_event_channel * noinline hpet_get_free_channel(void)
{
unsigned ch, tries;
for ( tries = num_hpets_used; tries; --tries )
{
if ( (ch = ffs(free_channels)) == 0 )
break;
--ch;
ASSERT(ch < num_hpets_used);
if ( test_and_clear_bit(ch, &free_channels) )
return &hpet_events[ch];
}
return NULL;
}
With regular ffs:
ffff82d08019e150 <hpet_get_free_channel>:
ffff82d08019e150: 55 push %rbp
ffff82d08019e151: 48 89 e5 mov %rsp,%rbp
ffff82d08019e154: 8b 15 9e 53 0d 00 mov 0xd539e(%rip),%edx
# ffff82d0802734f8 <num_hpets_used>
ffff82d08019e15a: 85 d2 test %edx,%edx
ffff82d08019e15c: 74 7d je ffff82d08019e1db
<hpet_get_free_channel+0x8b>
ffff82d08019e15e: 8b 05 6c 82 17 00 mov 0x17826c(%rip),%eax
# ffff82d0803163d0 <free_channels>
ffff82d08019e164: 48 0f bc c0 bsf %rax,%rax
ffff82d08019e168: 75 07 jne ffff82d08019e171
<hpet_get_free_channel+0x21>
ffff82d08019e16a: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
ffff82d08019e171: 83 c0 01 add $0x1,%eax
ffff82d08019e174: 74 6c je ffff82d08019e1e2
<hpet_get_free_channel+0x92>
ffff82d08019e176: 83 e8 01 sub $0x1,%eax
ffff82d08019e179: 39 c2 cmp %eax,%edx
ffff82d08019e17b: 76 33 jbe ffff82d08019e1b0
<hpet_get_free_channel+0x60>
ffff82d08019e17d: f0 0f b3 05 4b 82 17 lock btr %eax,0x17824b(%rip)
# ffff82d0803163d0 <free_channels>
ffff82d08019e184: 00
ffff82d08019e185: 19 c9 sbb %ecx,%ecx
ffff82d08019e187: 85 c9 test %ecx,%ecx
ffff82d08019e189: 74 44 je ffff82d08019e1cf
<hpet_get_free_channel+0x7f>
ffff82d08019e18b: eb 33 jmp ffff82d08019e1c0
<hpet_get_free_channel+0x70>
ffff82d08019e18d: 8b 05 3d 82 17 00 mov 0x17823d(%rip),%eax
# ffff82d0803163d0 <free_channels>
ffff82d08019e193: 48 0f bc c0 bsf %rax,%rax
ffff82d08019e197: 75 07 jne ffff82d08019e1a0
<hpet_get_free_channel+0x50>
ffff82d08019e199: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
ffff82d08019e1a0: 83 c0 01 add $0x1,%eax
ffff82d08019e1a3: 74 44 je ffff82d08019e1e9
<hpet_get_free_channel+0x99>
ffff82d08019e1a5: 83 e8 01 sub $0x1,%eax
ffff82d08019e1a8: 3b 05 4a 53 0d 00 cmp 0xd534a(%rip),%eax
# ffff82d0802734f8 <num_hpets_used>
ffff82d08019e1ae: 72 02 jb ffff82d08019e1b2
<hpet_get_free_channel+0x62>
ffff82d08019e1b0: 0f 0b ud2
ffff82d08019e1b2: f0 0f b3 05 16 82 17 lock btr %eax,0x178216(%rip)
# ffff82d0803163d0 <free_channels>
ffff82d08019e1b9: 00
ffff82d08019e1ba: 19 c9 sbb %ecx,%ecx
ffff82d08019e1bc: 85 c9 test %ecx,%ecx
ffff82d08019e1be: 74 0f je ffff82d08019e1cf
<hpet_get_free_channel+0x7f>
ffff82d08019e1c0: 89 c0 mov %eax,%eax
ffff82d08019e1c2: 48 c1 e0 07 shl $0x7,%rax
ffff82d08019e1c6: 48 03 05 33 53 0d 00 add 0xd5333(%rip),%rax
# ffff82d080273500 <hpet_events>
ffff82d08019e1cd: eb 1f jmp ffff82d08019e1ee
<hpet_get_free_channel+0x9e>
ffff82d08019e1cf: 83 ea 01 sub $0x1,%edx
ffff82d08019e1d2: 75 b9 jne ffff82d08019e18d
<hpet_get_free_channel+0x3d>
ffff82d08019e1d4: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e1d9: eb 13 jmp ffff82d08019e1ee
<hpet_get_free_channel+0x9e>
ffff82d08019e1db: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e1e0: eb 0c jmp ffff82d08019e1ee
<hpet_get_free_channel+0x9e>
ffff82d08019e1e2: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e1e7: eb 05 jmp ffff82d08019e1ee
<hpet_get_free_channel+0x9e>
ffff82d08019e1e9: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e1ee: 5d pop %rbp
ffff82d08019e1ef: c3 retq
With __builtin_ffs:
ffff82d08019e4a5 <hpet_get_free_channel>:
ffff82d08019e4a5: 55 push %rbp
ffff82d08019e4a6: 48 89 e5 mov %rsp,%rbp
ffff82d08019e4a9: 8b 15 c9 4f 0d 00 mov 0xd4fc9(%rip),%edx
# ffff82d080273478 <num_hpets_used>
ffff82d08019e4af: 85 d2 test %edx,%edx
ffff82d08019e4b1: 74 4a je ffff82d08019e4fd
<hpet_get_free_channel+0x58>
ffff82d08019e4b3: be ff ff ff ff mov $0xffffffff,%esi
ffff82d08019e4b8: 0f bc 05 11 7f 17 00 bsf 0x177f11(%rip),%eax
# ffff82d0803163d0 <free_channels>
ffff82d08019e4bf: 0f 44 c6 cmove %esi,%eax
ffff82d08019e4c2: 83 c0 01 add $0x1,%eax
ffff82d08019e4c5: 74 3d je ffff82d08019e504
<hpet_get_free_channel+0x5f>
ffff82d08019e4c7: 83 e8 01 sub $0x1,%eax
ffff82d08019e4ca: 3b 05 a8 4f 0d 00 cmp 0xd4fa8(%rip),%eax
# ffff82d080273478 <num_hpets_used>
ffff82d08019e4d0: 72 02 jb ffff82d08019e4d4
<hpet_get_free_channel+0x2f>
ffff82d08019e4d2: 0f 0b ud2
ffff82d08019e4d4: f0 0f b3 05 f4 7e 17 lock btr %eax,0x177ef4(%rip)
# ffff82d0803163d0 <free_channels>
ffff82d08019e4db: 00
ffff82d08019e4dc: 19 c9 sbb %ecx,%ecx
ffff82d08019e4de: 85 c9 test %ecx,%ecx
ffff82d08019e4e0: 74 0f je ffff82d08019e4f1
<hpet_get_free_channel+0x4c>
ffff82d08019e4e2: 89 c0 mov %eax,%eax
ffff82d08019e4e4: 48 c1 e0 07 shl $0x7,%rax
ffff82d08019e4e8: 48 03 05 91 4f 0d 00 add 0xd4f91(%rip),%rax
# ffff82d080273480 <hpet_events>
ffff82d08019e4ef: eb 18 jmp ffff82d08019e509
<hpet_get_free_channel+0x64>
ffff82d08019e4f1: 83 ea 01 sub $0x1,%edx
ffff82d08019e4f4: 75 c2 jne ffff82d08019e4b8
<hpet_get_free_channel+0x13>
ffff82d08019e4f6: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e4fb: eb 0c jmp ffff82d08019e509
<hpet_get_free_channel+0x64>
ffff82d08019e4fd: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e502: eb 05 jmp ffff82d08019e509
<hpet_get_free_channel+0x64>
ffff82d08019e504: b8 00 00 00 00 mov $0x0,%eax
ffff82d08019e509: 5d pop %rbp
ffff82d08019e50a: c3 retq