Jan Beulich
2007-Jun-19 10:16 UTC
[Xen-devel] [PATCH] x86: introduce specialized clear_page()
Both IA64 and PowerPC have specialized (non-memset) versions for this, so
add one, more than doubling the performance of page clearing on not too
old processors (SSE2 supported). While the patch also adds an SSE version,
this is currently orphaned as I am not certain about the benefit of
special casing idle VCPUs in a few places (during context switching), so
that at least in that context using %xmmN registers would be possible
without crashing and/or corrupting guest state. The benefit of adding such
support could be to reduce scheduling latency when a VCPU is to transition
out of idle, but is busy doing page cleaning.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2007-06-18/xen/arch/x86/domain.c
===================================================================
--- 2007-06-18.orig/xen/arch/x86/domain.c 2007-06-04 08:35:35.000000000 +0200
+++ 2007-06-18/xen/arch/x86/domain.c 2007-06-18 11:57:46.000000000 +0200
@@ -151,7 +151,8 @@ int setup_arg_xlat_area(struct vcpu *v,
         pg = alloc_domheap_page(NULL);
         if ( !pg )
             return -ENOMEM;
-        d->arch.mm_arg_xlat_l3 = clear_page(page_to_virt(pg));
+        d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
+        clear_page(d->arch.mm_arg_xlat_l3);
     }
 
     l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
@@ -429,7 +430,8 @@ int arch_domain_create(struct domain *d)
 
     if ( (pg = alloc_domheap_page(NULL)) == NULL )
         goto fail;
-    d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
+    d->arch.mm_perdomain_l2 = page_to_virt(pg);
+    clear_page(d->arch.mm_perdomain_l2);
     for ( i = 0; i < (1 << pdpt_order); i++ )
         d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
             l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
@@ -437,7 +439,8 @@ int arch_domain_create(struct domain *d)
 
     if ( (pg = alloc_domheap_page(NULL)) == NULL )
         goto fail;
-    d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
+    d->arch.mm_perdomain_l3 = page_to_virt(pg);
+    clear_page(d->arch.mm_perdomain_l3);
     d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
         l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                       __PAGE_HYPERVISOR);
Index: 2007-06-18/xen/arch/x86/x86_32/Makefile
===================================================================
--- 2007-06-18.orig/xen/arch/x86/x86_32/Makefile 2006-11-14 13:51:10.000000000 +0100
+++ 2007-06-18/xen/arch/x86/x86_32/Makefile 2007-06-18 11:57:46.000000000 +0200
@@ -1,3 +1,4 @@
+obj-y += clear_page.o
 obj-y += domain_page.o
 obj-y += entry.o
 obj-y += gpr_switch.o
Index: 2007-06-18/xen/arch/x86/x86_32/clear_page.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ 2007-06-18/xen/arch/x86/x86_32/clear_page.S 2007-06-18 11:57:46.000000000 +0200
@@ -0,0 +1,36 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+ENTRY(clear_page_xmm)
+        movl 4(%esp), %edx
+        movl $PAGE_SIZE / 64, %ecx
+        xorps %xmm0, %xmm0
+
+0:
+        decl %ecx
+        movntps %xmm0, (%edx)
+        movntps %xmm0, 16(%edx)
+        movntps %xmm0, 32(%edx)
+        movntps %xmm0, 48(%edx)
+        leal 64(%edx), %edx
+        jnz 0b
+
+        sfence
+        ret
+
+ENTRY(clear_page_sse2)
+        movl 4(%esp), %edx
+        movl $PAGE_SIZE / 16, %ecx
+        xorl %eax, %eax
+
+0:
+        decl %ecx
+        movnti %eax, (%edx)
+        movnti %eax, 4(%edx)
+        movnti %eax, 8(%edx)
+        movnti %eax, 12(%edx)
+        leal 16(%edx), %edx
+        jnz 0b
+
+        sfence
+        ret
Index: 2007-06-18/xen/arch/x86/x86_64/Makefile
===================================================================
--- 2007-06-18.orig/xen/arch/x86/x86_64/Makefile 2007-02-12 14:00:54.000000000 +0100
+++ 2007-06-18/xen/arch/x86/x86_64/Makefile 2007-06-18 11:57:46.000000000 +0200
@@ -1,12 +1,13 @@
 subdir-y += compat
 
+obj-y += clear_page.o
 obj-y += entry.o
-obj-y += compat_kexec.o
 obj-y += gpr_switch.o
 obj-y += mm.o
 obj-y += traps.o
 
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_COMPAT) += compat_kexec.o
 obj-$(CONFIG_COMPAT) += domain.o
 obj-$(CONFIG_COMPAT) += physdev.o
 obj-$(CONFIG_COMPAT) += platform_hypercall.o
Index: 2007-06-18/xen/arch/x86/x86_64/clear_page.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ 2007-06-18/xen/arch/x86/x86_64/clear_page.S 2007-06-18 11:57:46.000000000 +0200
@@ -0,0 +1,34 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+ENTRY(clear_page_xmm)
+        movl $PAGE_SIZE / 64, %ecx
+        xorps %xmm0, %xmm0
+
+0:
+        decl %ecx
+        movntps %xmm0, (%rdi)
+        movntps %xmm0, 16(%rdi)
+        movntps %xmm0, 32(%rdi)
+        movntps %xmm0, 48(%rdi)
+        leaq 64(%rdi), %rdi
+        jnz 0b
+
+        sfence
+        ret
+
+ENTRY(clear_page_sse2)
+        movl $PAGE_SIZE / 32, %ecx
+        xorl %eax, %eax
+
+0:
+        decl %ecx
+        movnti %rax, (%rdi)
+        movnti %rax, 8(%rdi)
+        movnti %rax, 16(%rdi)
+        movnti %rax, 24(%rdi)
+        leaq 32(%rdi), %rdi
+        jnz 0b
+
+        sfence
+        ret
Index: 2007-06-18/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2007-06-18.orig/xen/arch/x86/x86_64/mm.c 2007-06-04 08:35:35.000000000 +0200
+++ 2007-06-18/xen/arch/x86/x86_64/mm.c 2007-06-18 11:57:46.000000000 +0200
@@ -106,7 +106,8 @@ void __init paging_init(void)
     /* Create user-accessible L2 directory to map the MPT for guests. */
     if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
         goto nomem;
-    l3_ro_mpt = clear_page(page_to_virt(l2_pg));
+    l3_ro_mpt = page_to_virt(l2_pg);
+    clear_page(l3_ro_mpt);
     l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
               l4e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
 
@@ -132,7 +133,8 @@ void __init paging_init(void)
             if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
                 goto nomem;
             va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
-            l2_ro_mpt = clear_page(page_to_virt(l2_pg));
+            l2_ro_mpt = page_to_virt(l2_pg);
+            clear_page(l2_ro_mpt);
             l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                       l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
             l2_ro_mpt += l2_table_offset(va);
@@ -152,7 +154,8 @@ void __init paging_init(void)
     l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
     if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
         goto nomem;
-    compat_idle_pg_table_l2 = l2_ro_mpt = clear_page(page_to_virt(l2_pg));
+    compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
+    clear_page(l2_ro_mpt);
     l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
               l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
     l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
Index: 2007-06-18/xen/include/asm-x86/page.h
===================================================================
--- 2007-06-18.orig/xen/include/asm-x86/page.h 2007-06-04 08:35:36.000000000 +0200
+++ 2007-06-18/xen/include/asm-x86/page.h 2007-06-18 11:57:46.000000000 +0200
@@ -214,9 +214,12 @@ typedef struct { u64 pfn; } pagetable_t;
 #define pagetable_from_page(pg)  pagetable_from_pfn(page_to_mfn(pg))
 #define pagetable_from_paddr(p)  pagetable_from_pfn((p)>>PAGE_SHIFT)
 #define pagetable_null()         pagetable_from_pfn(0)
-#endif
 
-#define clear_page(_p)      memset((void *)(_p), 0, PAGE_SIZE)
+extern void clear_page_xmm(void *);
+extern void clear_page_sse2(void *);
+#define clear_page(_p)      (cpu_has_xmm2 ?                       \
+                             clear_page_sse2((void *)(_p)) :      \
+                             memset((void *)(_p), 0, PAGE_SIZE))
 #define copy_page(_t,_f)    memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
 
 #define mfn_valid(mfn)      ((mfn) < max_page)
@@ -244,6 +247,7 @@ typedef struct { u64 pfn; } pagetable_t;
 /* Convert between frame number and address formats.  */
 #define pfn_to_paddr(pfn)   ((paddr_t)(pfn) << PAGE_SHIFT)
 #define paddr_to_pfn(pa)    ((unsigned long)((pa) >> PAGE_SHIFT))
+#endif
 
 /* High table entries are reserved by the hypervisor. */
 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
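For readers who prefer C to assembly, the clearing loops above boil down to
filling the page with non-temporal (cache-bypassing) stores and then issuing
an sfence. The following intrinsics-based sketch is illustrative only and not
part of the patch; the function name and the 4KiB PAGE_SIZE are assumptions.
Note that, unlike this sketch and clear_page_xmm, the patch's clear_page_sse2
deliberately uses movnti with a general-purpose register so that no SSE
register state is clobbered.

#include <emmintrin.h>  /* SSE2 intrinsics: _mm_stream_si128, _mm_sfence */
#include <stddef.h>

#define PAGE_SIZE 4096  /* assumption: 4KiB x86 pages */

/* Illustrative C analogue of the XMM clearing loop above. */
static void clear_page_nt(void *page)
{
    __m128i zero = _mm_setzero_si128();
    char *p = page;
    size_t off;

    for ( off = 0; off < PAGE_SIZE; off += 64 )
    {
        /* Four 16-byte non-temporal stores per iteration, as in the .S files. */
        _mm_stream_si128((__m128i *)(p + off),      zero);
        _mm_stream_si128((__m128i *)(p + off + 16), zero);
        _mm_stream_si128((__m128i *)(p + off + 32), zero);
        _mm_stream_si128((__m128i *)(p + off + 48), zero);
    }

    _mm_sfence();  /* order the streaming stores before later accesses */
}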
Keir Fraser
2007-Jun-20 15:16 UTC
Re: [Xen-devel] [PATCH] x86: introduce specialized clear_page()
On 19/6/07 11:16, "Jan Beulich" <jbeulich@novell.com> wrote:

> While the patch also adds an SSE version, this is currently orphaned
> as I am not certain about the benefit of special casing idle VCPUs in
> a few places (during context switching), so that at least in that
> context using %xmmN registers would be possible without crashing and/or
> corrupting guest state. The benefit of adding such support could be to
> reduce scheduling latency when a VCPU is to transition out of idle, but
> is busy doing page cleaning.

I measure no benefit from the XMM version (nor from upgrading the SSE2
version on x86/64 to clear from %rax instead of %eax). I guess this is
because the move is non-temporal and it's actually quite easy to max the
memory bandwidth.

So I stripped out the xmm version (it can't work anyway without some
kernel_fpu_begin/end kind of mechanism) and made the sse2 version common
across i386 and x86_64 since that was easily done and increases code
sharing.

 -- Keir
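For context, a "kernel_fpu_begin/end kind of mechanism" means bracketing any
hypervisor-internal use of %xmm registers with code that saves and later
restores whatever FPU/SSE state the current context owns. A minimal
user-space sketch of that save/use/restore pattern follows; the names are
hypothetical, and a real implementation would also have to handle CR0.TS and
lazy-FPU bookkeeping, which is omitted here.

#include <stdint.h>

/* fxsave/fxrstor operate on a 512-byte area aligned to 16 bytes. */
static uint8_t fpu_save_area[512] __attribute__((aligned(16)));

static void fpu_begin(void)  /* hypothetical analogue of kernel_fpu_begin() */
{
    __asm__ volatile ( "fxsave %0" : "=m" (fpu_save_area) );
}

static void fpu_end(void)    /* hypothetical analogue of kernel_fpu_end() */
{
    __asm__ volatile ( "fxrstor %0" : : "m" (fpu_save_area) );
}

static void clear_with_xmm(void *page)
{
    fpu_begin();
    /* %xmm registers may now be clobbered freely, e.g. by a routine such as
     * clear_page_xmm(page); here we just zero %xmm0 as a stand-in. */
    (void)page;
    __asm__ volatile ( "xorps %%xmm0, %%xmm0" ::: "xmm0" );
    fpu_end();
}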
Jan Beulich
2007-Jun-20 15:35 UTC
Re: [Xen-devel] [PATCH] x86: introduce specialized clear_page()
>>> Keir Fraser <keir@xensource.com> 20.06.07 17:16 >>>
>On 19/6/07 11:16, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>> While the patch also adds an SSE version, this is currently orphaned
>> as I am not certain about the benefit of special casing idle VCPUs in
>> a few places (during context switching), so that at least in that
>> context using %xmmN registers would be possible without crashing and/or
>> corrupting guest state. The benefit of adding such support could be to
>> reduce scheduling latency when a VCPU is to transition out of idle, but
>> is busy doing page cleaning.
>
>I measure no benefit from the XMM version (nor from upgrading the SSE2
>version on x86/64 to clear from %rax instead of %eax). I guess this is
>because the move is non-temporal and it's actually quite easy to max the
>memory bandwidth.

Neither did I, except (obviously) on machines having SSE, but not SSE2 (which
was what I considered applying this for, with the additional need to tweak
idle vcpu handling so that in that context [where FP context altering doesn't
matter, as long as the rest of the system's aware of it] it could then be
used).

>So I stripped out the xmm version (it can't work anyway without some
>kernel_fpu_begin/end kind of mechanism) and made the sse2 version common
>across i386 and x86_64 since that was easily done and increases code
>sharing.

Fine by me - I tried to say this in the description.

Jan
Keir Fraser
2007-Jun-20 15:53 UTC
Re: [Xen-devel] [PATCH] x86: introduce specialized clear_page()
On 20/6/07 16:35, "Jan Beulich" <jbeulich@novell.com> wrote:

>> I measure no benefit from the XMM version (nor from upgrading the SSE2
>> version on x86/64 to clear from %rax instead of %eax). I guess this is
>> because the move is non-temporal and it's actually quite easy to max the
>> memory bandwidth.
>
> Neither did I, except (obviously) on machines having SSE, but not SSE2 (which
> was what I considered applying this for, with the additional need to tweak
> idle vcpu handling so that in that context [where FP context altering doesn't
> matter, as long as the rest of the system's aware of it] it could then be
> used).

Oh, yes. Well, that's not a very interesting set of CPUs to optimise for.

 -- Keir