thr3ads.net - Xen devel - [PATCH] linux-2.6.18/x86-64: provide a memset() that can deal with 4Gb or above at a time [Mar 2012]

If this information is useful, please help other people find it:
Share via:

Jan Beulich

2012-Mar-23 11:19 UTC

[PATCH] linux-2.6.18/x86-64: provide a memset() that can deal with 4Gb or above at a time

Now that a corresponding change got accepted into Linux 3.4, let's fix
this in our code too. It is particularly required by the memset()
invoked from __alloc_bootmem_core(), which can be called with sizes
beyond 4Gb out of alloc_node_mem_map() when CONFIG_FLAT_NODE_MEM_MAP is
defined (starting at around 300Gb).

In order to not affect the native kernel (which is unlikely to be
affected anyway, as it usually sets up separate maps for each node [as
long as NUMA is defined], and hence would require said amount of memory
per node [and SPARSEMEM not to be used] for the problem to become
visible, plus in this tree we're not really concerned about fixing
native problems), introduce a Xen-specific clone of the original file.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- /dev/null
+++ b/arch/x86_64/lib/memset-xen.S
@@ -0,0 +1,122 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
+ * rax   original destination
+ */
+	.globl __memset	/* both exported names label the same entry point below */
+	.globl memset
+	.p2align 4	/* 16-byte align the entry point */
+memset:
+__memset:
+	movq %rdi,%r10	/* r10 = original dst; copied to rax at .Lende */
+
+	/* expand byte value  */
+	movzbl %sil,%ecx	/* rcx = fill byte, zero-extended */
+	movabs $0x0101010101010101,%rax	/* 0x01 in each of the 8 bytes */
+	imulq  %rcx,%rax	/* rax = fill byte replicated 8 times; rdx (count) is not written */
+
+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d	/* r9 = dst & 7, i.e. offset within an 8-byte unit */
+	jnz  .Lbad_alignment	/* non-zero: dst is not 8-byte aligned */
+.Lafter_bad_alignment:
+
+	movq  %rdx,%rcx
+	shrq  $6,%rcx	/* rcx = count / 64, taken from the full 64-bit count */
+	jz	 .Lhandle_tail	/* fewer than 64 bytes: skip the unrolled loop */
+
+	.p2align 4
+.Lloop_64:	/* each pass stores 64 bytes as eight qwords */
+	decq  %rcx	/* sets ZF for the jnz below; the movq/leaq in between leave flags alone */
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi	/* dst += 64 without modifying flags */
+	jnz    .Lloop_64	/* loop while the decq result was non-zero */
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%edx,%ecx	/* only the low 6 bits of count matter from here on */
+	andl    $63&(~7),%ecx	/* ecx = count & 0x38: bytes left in whole qwords */
+	jz 		.Lhandle_7	/* no whole qword left: go to the byte tail */
+	shrl	$3,%ecx	/* ecx = number of remaining qwords (1..7 here) */
+	.p2align 4
+.Lloop_8:
+	decl   %ecx	/* ZF consumed by jnz; movq/leaq preserve flags */
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi	/* dst += 8 */
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	andl	$7,%edx	/* edx = count & 7: trailing bytes */
+	jz      .Lende	/* nothing left to store */
+	.p2align 4
+.Lloop_1:
+	decl    %edx	/* ZF consumed by jnz; movb/leaq preserve flags */
+	movb 	%al,(%rdi)	/* low byte of rax is the fill byte */
+	leaq	1(%rdi),%rdi	/* dst += 1 */
+	jnz     .Lloop_1
+
+.Lende:
+	movq	%r10,%rax	/* return the original destination */
+	ret
+
+.Lbad_alignment:	/* entered with r9 = dst & 7, non-zero */
+	cmpq $7,%rdx	/* compare the full 64-bit count */
+	jbe	.Lhandle_7	/* count <= 7 (unsigned): the byte loop stores everything */
+	movq %rax,(%rdi)	/* unaligned store of the first 8 bytes (count >= 8 here) */
+	movq $8,%r8
+	subq %r9,%r8	/* r8 = 8 - (dst & 7): bytes up to the next 8-byte boundary */
+	addq %r8,%rdi	/* dst is now 8-byte aligned */
+	subq %r8,%rdx	/* those r8 bytes were covered by the store above */
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"	/* NOTE(review): record layout presumably mirrors struct alt_instr (instr, replacement, cpuid, instrlen, replacementlen) -- confirm against asm/alternative.h */
+	.align 8
+	.quad  memset	/* address of the open-coded memset above */
+	.quad  memset_c	/* address of the rep-stos variant below */
+	.byte  X86_FEATURE_REP_GOOD	/* feature selector from asm/cpufeature.h */
+	.byte  memset_c_end-memset_c	/* both length bytes hold the size of memset_c in bytes */
+	.byte  memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
+	movq %rdi,%r9	/* r9 = original dst; copied to rax before ret */
+	movq %rdx,%rcx	/* work on the full 64-bit count */
+	andl $7,%edx	/* edx = count & 7: trailing bytes for the stosb pass */
+	shrq $3,%rcx	/* rcx = count / 8: qwords for the stosq pass (64-bit shift) */
+	/* expand byte value  */
+	movzbl %sil,%esi	/* rsi = fill byte, zero-extended */
+	movabs $0x0101010101010101,%rax	/* 0x01 in each of the 8 bytes */
+	imulq %rsi,%rax	/* rax = fill byte replicated 8 times; rdx (tail count) is not written */
+	rep	/* repeat the following stosq rcx times */
+	stosq	/* store rax at (rdi), advancing rdi by 8 */
+	movl %edx,%ecx	/* rcx = trailing byte count (0..7), upper half zeroed */
+	rep	/* repeat the following stosb rcx times */
+	stosb	/* store al at (rdi), advancing rdi by 1 */
+	movq %r9,%rax	/* return the original destination */
+	ret
+memset_c_end:	/* end marker used to compute the size of memset_c */
+	.previous





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

Xen devel - Mar 2012 - [PATCH] linux-2.6.18/x86-64: provide a memset() that can deal with 4Gb or above at a time

[PATCH] linux-2.6.18/x86-64: provide a memset() that can deal with 4Gb or above at a time