On Wed, Oct 10, 2018 at 01:54:33PM -0500, Segher Boessenkool wrote: > It would be great to hear from kernel people if it works adequately for > what you guys want it for :-) Sure, ping me when you have the final version and I'll try to build gcc with it and do some size comparisons. Thx. -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
Ok, with Segher's help I've been playing with his patch ontop of bleeding edge gcc 9 and here are my observations. Please double-check me for booboos so that they can be addressed while there's time. So here's what I see ontop of 4.19-rc7: First marked the alternative asm() as inline and undeffed the "inline" keyword. I need to do that because of the funky games we do with "inline" redefinitions in include/linux/compiler_types.h. And Segher hinted at either doing: asm volatile inline(... or asm volatile __inline__(... but both "inline" variants are defined as macros in that file. Which means we either need to #undef inline before using it in asm() or come up with something cleverer. Anyway, did this: --- diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4cd6a3b71824..7c0639087da7 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -165,11 +165,13 @@ static inline int alternatives_text_reserved(void *start, void *end) * For non barrier like inlines please define new variants * without volatile and memory clobber. */ + +#undef inline #define alternative(oldinstr, newinstr, feature) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") + asm volatile inline(ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ - asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + asm volatile inline(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") /* * Alternative inline assembly with input. 
--- Build failed at link time with: arch/x86/boot/compressed/cmdline.o: In function `native_save_fl': cmdline.c:(.text+0x0): multiple definition of `native_save_fl' arch/x86/boot/compressed/misc.o:misc.c:(.text+0x1e0): first defined here arch/x86/boot/compressed/cmdline.o: In function `native_restore_fl': cmdline.c:(.text+0x10): multiple definition of `native_restore_fl' arch/x86/boot/compressed/misc.o:misc.c:(.text+0x1f0): first defined here arch/x86/boot/compressed/error.o: In function `native_save_fl': error.c:(.text+0x0): multiple definition of `native_save_fl' which I had to fix with this: --- diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 15450a675031..0d772598c37c 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -14,8 +14,7 @@ */ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ -extern inline unsigned long native_save_fl(void); -extern inline unsigned long native_save_fl(void) +static inline unsigned long native_save_fl(void) { unsigned long flags; @@ -33,8 +32,7 @@ ex --- That "extern inline" declaration looks fishy to me anyway, maybe not really needed ? Apparently, gcc < 4.9 complains with -Werror=missing-prototypes... Then the build worked and the results look like this: text data bss dec hex filename 17287384 5040656 2019532 24347572 17383b4 vmlinux-before 17288020 5040656 2015436 24344112 1737630 vmlinux-2nd-version so some inlining must be happening. 
Then I did this: --- diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 9c5606d88f61..a0170344cf08 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -20,7 +20,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) /* no memory constraint because it doesn't change any memory gcc knows about */ stac(); - asm volatile( + asm volatile inline( " testq %[size8],%[size8]\n" " jz 4f\n" "0: movq $0,(%[dst])\n" --- to force inlining of a somewhat bigger asm() statement. And yap, more got inlined: text data bss dec hex filename 17287384 5040656 2019532 24347572 17383b4 vmlinux-before 17288020 5040656 2015436 24344112 1737630 vmlinux-2nd-version 17288076 5040656 2015436 24344168 1737668 vmlinux-2nd-version__clear_user so more stuff gets inlined. Looking at the asm output, it had before: --- clear_user: # ./arch/x86/include/asm/current.h:15: return this_cpu_read_stable(current_task); #APP # 15 "./arch/x86/include/asm/current.h" 1 movq %gs:current_task,%rax #, pfo_ret__ # 0 "" 2 # arch/x86/lib/usercopy_64.c:51: if (access_ok(VERIFY_WRITE, to, n)) #NO_APP movq 2520(%rax), %rdx # pfo_ret___12->thread.addr_limit.seg, _1 movq %rdi, %rax # to, tmp93 addq %rsi, %rax # n, tmp93 jc .L3 #, # arch/x86/lib/usercopy_64.c:51: if (access_ok(VERIFY_WRITE, to, n)) cmpq %rax, %rdx # tmp93, _1 jb .L3 #, # arch/x86/lib/usercopy_64.c:52: return __clear_user(to, n); jmp __clear_user # --- note the JMP to __clear_user. 
After marking the asm() in __clear_user() as inline, clear_user() inlines __clear_user() directly: --- clear_user: # ./arch/x86/include/asm/current.h:15: return this_cpu_read_stable(current_task); #APP # 15 "./arch/x86/include/asm/current.h" 1 movq %gs:current_task,%rax #, pfo_ret__ # 0 "" 2 # arch/x86/lib/usercopy_64.c:51: if (access_ok(VERIFY_WRITE, to, n)) #NO_APP movq 2520(%rax), %rdx # pfo_ret___12->thread.addr_limit.seg, _1 movq %rdi, %rax # to, tmp95 addq %rsi, %rax # n, tmp95 jc .L8 #, # arch/x86/lib/usercopy_64.c:51: if (access_ok(VERIFY_WRITE, to, n)) cmpq %rax, %rdx # tmp95, _1 jb .L8 #, # ./arch/x86/include/asm/smap.h:58: alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); ... this last line is the stac() macro which gets inlined due to the alternative() inlined macro above. So I guess this all looks like what we wanted. And if this lands in gcc9, we would need to do a asm_volatile() macro which is defined differently based on the compiler used. Thoughts, suggestions, etc are most welcome. Thx. -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
On Sun, Oct 14, 2018 at 12:14:02AM +0300, Alexander Monakov wrote: > I apologize for coming in late here with an alternative proposal, but would > you be happy if GCC gave you a way to designate a portion of the asm template > string that shouldn't be counted as its cost because it doesn't go into the > .text section? This wouldn't interact with your redefinitions of the inline > keyword, and you could do something like (assuming we go with %` ... %` > delimiters) I don't mind it but I see you guys are still discussing what would be the better solution here, on the gcc ML. And we, as one user, are a happy camper as long as it does what it is meant to do. But what the feature looks like syntactically is something for gcc folk to decide as they're going to support it for the foreseeable future and I'm very well aware of how important it is for a supportable feature to be designed properly. Thx. -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
On Wed, 2018-10-31 at 13:55 +0100, Peter Zijlstra wrote: > > Anyway, with the below patch, I get: > > text data bss dec hex filename > 17385183 5064780 1953892 24403855 1745f8f defconfig-build/vmlinux > 17385678 5064780 1953892 24404350 174617e defconfig-build/vmlinux > > Which shows we inline more (look for asm_volatile for the actual > changes). [] > scripts/checkpatch.pl | 8 ++--- > scripts/genksyms/keywords.c | 4 +-- > scripts/kernel-doc | 4 +-- I believe these should be excluded from the conversions. Other than that, generic conversion by script seems a good idea.
On Wed, Oct 31, 2018 at 10:20:00PM -0700, Joe Perches wrote: > On Wed, 2018-10-31 at 13:55 +0100, Peter Zijlstra wrote: > > > > Anyway, with the below patch, I get: > > > > text data bss dec hex filename > > 17385183 5064780 1953892 24403855 1745f8f defconfig-build/vmlinux > > 17385678 5064780 1953892 24404350 174617e defconfig-build/vmlinux > > > > Which shows we inline more (look for asm_volatile for the actual > > changes). > [] > > scripts/checkpatch.pl | 8 ++--- > > scripts/genksyms/keywords.c | 4 +-- > > scripts/kernel-doc | 4 +-- > > I believe these should be excluded from the conversions. Probably, yes. It compiled, which was all I cared about :-) BTW, if we do that conversion, we should upgrade the checkpatch warning to an error I suppose.
Hi Peter, On Wed, Oct 31, 2018 at 9:58 PM Peter Zijlstra <peterz at infradead.org> wrote: > > On Sat, Oct 13, 2018 at 09:33:35PM +0200, Borislav Petkov wrote: > > Ok, > > > > with Segher's help I've been playing with his patch ontop of bleeding > > edge gcc 9 and here are my observations. Please double-check me for > > booboos so that they can be addressed while there's time. > > > > So here's what I see ontop of 4.19-rc7: > > > > First marked the alternative asm() as inline and undeffed the "inline" > > keyword. I need to do that because of the funky games we do with > > "inline" redefinitions in include/linux/compiler_types.h. > > > > And Segher hinted at either doing: > > > > asm volatile inline(... > > > > or > > > > asm volatile __inline__(... > > > > but both "inline" variants are defined as macros in that file. > > > > Which means we either need to #undef inline before using it in asm() or > > come up with something cleverer. > > # git grep -e "\<__inline__\>" | wc -l > 488 > # git grep -e "\<__inline\>" | wc -l > 56 > # git grep -e "\<inline\>" | wc -l > 69598 > > And we already have scripts/checkpatch.pl: > > # Check for __inline__ and __inline, prefer inline > > Which suggests we do: > > git grep -l "\<__inline\(\|__\)\>" | while read file > do > sed -i -e 's/\<__inline\(\|__\)\>/inline/g' $file > done > > and get it over with. Do you have a plan to really do this? This is a nice cleanup anyway. I think the last minute of the merge window (MW) is good timing for a global replacement like this. > > Anyway, with the below patch, I get: > > text data bss dec hex filename > 17385183 5064780 1953892 24403855 1745f8f defconfig-build/vmlinux > 17385678 5064780 1953892 24404350 174617e defconfig-build/vmlinux > > Which shows we inline more (look for asm_volatile for the actual > changes). > > > So yes, this seems like something we could work with. 
> > --- > Documentation/trace/tracepoint-analysis.rst | 2 +- > Documentation/translations/ja_JP/SubmittingPatches | 4 +-- > Documentation/translations/zh_CN/SubmittingPatches | 4 +-- > arch/alpha/include/asm/atomic.h | 12 +++---- > arch/alpha/include/asm/bitops.h | 6 ++-- > arch/alpha/include/asm/compiler.h | 4 +-- > arch/alpha/include/asm/dma.h | 22 ++++++------ > arch/alpha/include/asm/floppy.h | 4 +-- > arch/alpha/include/asm/irq.h | 2 +- > arch/alpha/include/asm/local.h | 4 +-- > arch/alpha/include/asm/smp.h | 2 +- > arch/arm/mach-iop32x/include/mach/uncompress.h | 2 +- > arch/arm/mach-iop33x/include/mach/uncompress.h | 2 +- > arch/arm/mach-ixp4xx/include/mach/uncompress.h | 2 +- > arch/ia64/hp/common/sba_iommu.c | 2 +- > arch/ia64/hp/sim/simeth.c | 2 +- > arch/ia64/include/asm/atomic.h | 8 ++--- > arch/ia64/include/asm/bitops.h | 34 +++++++++--------- > arch/ia64/include/asm/delay.h | 14 ++++---- > arch/ia64/include/asm/irq.h | 2 +- > arch/ia64/include/asm/page.h | 2 +- > arch/ia64/include/asm/sn/leds.h | 2 +- > arch/ia64/include/asm/uaccess.h | 4 +-- > arch/ia64/include/uapi/asm/rse.h | 12 +++---- > arch/ia64/include/uapi/asm/swab.h | 6 ++-- > arch/ia64/oprofile/backtrace.c | 4 +-- > arch/m68k/include/asm/blinken.h | 2 +- > arch/m68k/include/asm/checksum.h | 2 +- > arch/m68k/include/asm/dma.h | 32 ++++++++--------- > arch/m68k/include/asm/floppy.h | 8 ++--- > arch/m68k/include/asm/nettel.h | 8 ++--- > arch/m68k/mac/iop.c | 14 ++++---- > arch/mips/include/asm/atomic.h | 16 ++++----- > arch/mips/include/asm/checksum.h | 2 +- > arch/mips/include/asm/dma.h | 20 +++++------ > arch/mips/include/asm/jazz.h | 2 +- > arch/mips/include/asm/local.h | 4 +-- > arch/mips/include/asm/string.h | 8 ++--- > arch/mips/kernel/binfmt_elfn32.c | 2 +- > arch/nds32/include/asm/swab.h | 4 +-- > arch/parisc/include/asm/atomic.h | 20 +++++------ > arch/parisc/include/asm/bitops.h | 18 +++++----- > arch/parisc/include/asm/checksum.h | 4 +-- > arch/parisc/include/asm/compat.h | 2 +- > 
arch/parisc/include/asm/delay.h | 2 +- > arch/parisc/include/asm/dma.h | 20 +++++------ > arch/parisc/include/asm/ide.h | 8 ++--- > arch/parisc/include/asm/irq.h | 2 +- > arch/parisc/include/asm/spinlock.h | 12 +++---- > arch/powerpc/include/asm/atomic.h | 40 +++++++++++----------- > arch/powerpc/include/asm/bitops.h | 28 +++++++-------- > arch/powerpc/include/asm/dma.h | 20 +++++------ > arch/powerpc/include/asm/edac.h | 2 +- > arch/powerpc/include/asm/irq.h | 2 +- > arch/powerpc/include/asm/local.h | 14 ++++---- > arch/sh/include/asm/pgtable_64.h | 2 +- > arch/sh/include/asm/processor_32.h | 4 +-- > arch/sh/include/cpu-sh3/cpu/dac.h | 6 ++-- > arch/x86/include/asm/alternative.h | 14 ++++---- > arch/x86/um/asm/checksum.h | 4 +-- > arch/x86/um/asm/checksum_32.h | 4 +-- > arch/xtensa/include/asm/checksum.h | 14 ++++---- > arch/xtensa/include/asm/cmpxchg.h | 4 +-- > arch/xtensa/include/asm/irq.h | 2 +- > block/partitions/amiga.c | 2 +- > drivers/atm/he.c | 6 ++-- > drivers/atm/idt77252.c | 6 ++-- > drivers/gpu/drm/mga/mga_drv.h | 2 +- > drivers/gpu/drm/mga/mga_state.c | 14 ++++---- > drivers/gpu/drm/r128/r128_drv.h | 2 +- > drivers/gpu/drm/r128/r128_state.c | 14 ++++---- > drivers/gpu/drm/via/via_irq.c | 2 +- > drivers/gpu/drm/via/via_verifier.c | 30 ++++++++-------- > drivers/isdn/hardware/eicon/platform.h | 14 ++++---- > drivers/isdn/i4l/isdn_net.c | 14 ++++---- > drivers/isdn/i4l/isdn_net.h | 8 ++--- > drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +- > drivers/net/ethernet/sun/sungem.c | 8 ++--- > drivers/net/ethernet/sun/sunhme.c | 6 ++-- > drivers/net/hamradio/baycom_ser_fdx.c | 2 +- > drivers/net/wan/lapbether.c | 2 +- > drivers/net/wan/n2.c | 4 +-- > drivers/parisc/led.c | 4 +-- > drivers/parisc/sba_iommu.c | 2 +- > drivers/parport/parport_gsc.c | 2 +- > drivers/parport/parport_gsc.h | 4 +-- > drivers/parport/parport_pc.c | 2 +- > drivers/scsi/lpfc/lpfc_scsi.c | 2 +- > drivers/scsi/pcmcia/sym53c500_cs.c | 4 +-- > drivers/scsi/qla2xxx/qla_inline.h | 2 +- > 
drivers/scsi/qla2xxx/qla_os.c | 4 +-- > drivers/staging/rtl8723bs/core/rtw_pwrctrl.c | 4 +-- > drivers/staging/rtl8723bs/core/rtw_wlan_util.c | 2 +- > drivers/staging/rtl8723bs/include/drv_types.h | 6 ++-- > drivers/staging/rtl8723bs/include/ieee80211.h | 6 ++-- > drivers/staging/rtl8723bs/include/osdep_service.h | 10 +++--- > .../rtl8723bs/include/osdep_service_linux.h | 14 ++++---- > drivers/staging/rtl8723bs/include/rtw_mlme.h | 14 ++++---- > drivers/staging/rtl8723bs/include/rtw_recv.h | 16 ++++----- > drivers/staging/rtl8723bs/include/sta_info.h | 2 +- > drivers/staging/rtl8723bs/include/wifi.h | 14 ++++---- > drivers/staging/rtl8723bs/include/wlan_bssdef.h | 2 +- > drivers/tty/amiserial.c | 2 +- > drivers/tty/serial/ip22zilog.c | 2 +- > drivers/tty/serial/sunsab.c | 4 +-- > drivers/tty/serial/sunzilog.c | 2 +- > drivers/video/fbdev/core/fbcon.c | 20 +++++------ > drivers/video/fbdev/ffb.c | 2 +- > drivers/video/fbdev/intelfb/intelfbdrv.c | 10 +++--- > drivers/video/fbdev/intelfb/intelfbhw.c | 2 +- > drivers/w1/masters/matrox_w1.c | 4 +-- > fs/coda/coda_linux.h | 6 ++-- > fs/freevxfs/vxfs_inode.c | 2 +- > fs/nfsd/nfsfh.h | 4 +-- > include/acpi/platform/acgcc.h | 2 +- > include/acpi/platform/acintel.h | 2 +- > include/asm-generic/ide_iops.h | 8 ++--- > include/linux/atalk.h | 4 +-- > include/linux/ceph/messenger.h | 2 +- > include/linux/compiler_types.h | 4 +-- > include/linux/hdlc.h | 4 +-- > include/linux/inetdevice.h | 8 ++--- > include/linux/parport.h | 4 +-- > include/linux/parport_pc.h | 22 ++++++------ > include/net/ax25.h | 2 +- > include/net/checksum.h | 2 +- > include/net/dn_nsp.h | 16 ++++----- > include/net/ip.h | 2 +- > include/net/ip6_checksum.h | 2 +- > include/net/ipx.h | 10 +++--- > include/net/llc_c_ev.h | 4 +-- > include/net/llc_conn.h | 4 +-- > include/net/llc_s_ev.h | 2 +- > include/net/netrom.h | 8 ++--- > include/net/scm.h | 14 ++++---- > include/net/udplite.h | 2 +- > include/net/x25.h | 8 ++--- > include/net/xfrm.h | 18 +++++----- > 
include/uapi/linux/atm.h | 4 +-- > include/uapi/linux/atmsap.h | 2 +- > include/uapi/linux/map_to_7segment.h | 2 +- > include/uapi/linux/netfilter_arp/arp_tables.h | 2 +- > include/uapi/linux/netfilter_bridge/ebtables.h | 2 +- > include/uapi/linux/netfilter_ipv4/ip_tables.h | 2 +- > include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +- > include/video/newport.h | 12 +++---- > lib/zstd/mem.h | 2 +- > net/appletalk/atalk_proc.c | 4 +-- > net/appletalk/ddp.c | 2 +- > net/core/neighbour.c | 2 +- > net/core/scm.c | 2 +- > net/decnet/dn_nsp_in.c | 2 +- > net/decnet/dn_nsp_out.c | 2 +- > net/decnet/dn_route.c | 2 +- > net/decnet/dn_table.c | 4 +-- > net/ipv4/igmp.c | 2 +- > net/ipv6/af_inet6.c | 2 +- > net/ipv6/icmp.c | 4 +-- > net/ipv6/udp.c | 4 +-- > net/lapb/lapb_iface.c | 4 +-- > net/llc/llc_input.c | 2 +- > scripts/checkpatch.pl | 8 ++--- > scripts/genksyms/keywords.c | 4 +-- > scripts/kernel-doc | 4 +-- > sound/sparc/amd7930.c | 6 ++-- > 165 files changed, 547 insertions(+), 547 deletions(-)-- Best Regards Masahiro Yamada
Possibly Parallel Threads
- PROPOSAL: Extend inline asm syntax with size spec
- PROPOSAL: Extend inline asm syntax with size spec
- PROPOSAL: Extend inline asm syntax with size spec
- [PATCH 00/13] x86/paravirt: Make pv ops code generation more closely match reality
- [PATCH 00/13] x86/paravirt: Make pv ops code generation more closely match reality