When dest and src overlap and the memory area is large, the x86_64 memmove
is very inefficient, and it leads to bad performance, such as btrfs's file
deletion performance. This patch improves the performance of memmove on
x86_64 by using __memcpy_bwd() instead of a byte-by-byte copy when copying a
large memory area (len > 64).

I have tested this patchset by doing a 500-byte memory copy 50000 times with
various alignments and buffer sizes on my x86_64 box:

Len   Src Unalign  Dest Unalign  Without Patch  Patch applied
---   -----------  ------------  -------------  -------------
256   0            0             0s 815158us    0s 249647us
256   0            4             0s 816059us    0s 324210us
256   0            7             0s 815192us    0s 324254us
256   3            0             0s 815179us    0s 325991us
256   3            1             0s 815161us    0s 378462us
256   3            4             0s 815154us    0s 779306us
256   3            7             0s 815151us    0s 782924us
256   7            0             0s 815839us    0s 325524us
256   7            4             0s 815149us    0s 375658us
256   7            7             0s 815160us    0s 374488us
1024  0            0             3s 125891us    0s 437662us
1024  0            1             3s 125940us    0s 777524us
1024  0            4             3s 159788us    0s 778850us
1024  0            7             3s 155177us    0s 733927us
1024  4            0             3s 118323us    0s 830167us
1024  4            4             3s 129124us    0s 962505us
1024  4            7             3s 123456us    2s 600326us

After applying this patchset, the performance of file creation and deletion
on some filesystems becomes better. I have tested it with the following
benchmark tool on my x86_64 box:

  http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3

Test steps:
  # ./creat_unlink 50000

The result (total time):
Ext4:
                2.6.36-rc4    2.6.36-rc4 + patch
file creation   0.737007      0.701888            4.8% UP
file deletion   0.422226      0.413457            2.1% UP
Btrfs:
                2.6.36-rc4    2.6.36-rc4 + patch
file creation   0.977638      0.935208            4.3% UP
file deletion   1.327140      1.221073            8%   UP

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/include/asm/string_64.h |    1 +
 arch/x86/lib/Makefile            |    2 +-
 arch/x86/lib/memcpy_bwd_64.S     |  137 ++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/memmove_64.c        |   10 ++-
 4 files changed, 145 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/lib/memcpy_bwd_64.S

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..4e64a87 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -55,6 +55,7 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 void *memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
+extern void *__memcpy_bwd(void *dest, const void *src, size_t count);
 void *memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf07..ab241df 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_bwd_$(BITS).o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
diff --git a/arch/x86/lib/memcpy_bwd_64.S b/arch/x86/lib/memcpy_bwd_64.S
new file mode 100644
index 0000000..ca894e3
--- /dev/null
+++ b/arch/x86/lib/memcpy_bwd_64.S
@@ -0,0 +1,137 @@
+/* Copyright 2010 Miao Xie */
+
+#include <linux/linkage.h>
+
+#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+
+/*
+ * __memcpy_bwd - Copy a memory block from the end to the beginning
+ *
+ * Input:
+ *	rdi destination
+ *	rsi source
+ *	rdx count
+ *
+ * Output:
+ *	rax original destination
+ */
+
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_bwd_c:
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi
+
+	std
+
+	movq %rdx, %rcx
+	shrq $3, %rcx
+	andq $7, %rdx
+	rep movsq
+
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
+	decq %rsi
+	decq %rdi
+	movq %rdx, %rcx
+	rep movsb
+
+	cld
+	ret
+.Lmemcpy_bwd_e:
+	.previous
+
+ENTRY(__memcpy_bwd)
+	CFI_STARTPROC
+
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+
+	movq %rdx, %rcx
+	shrq $6, %rcx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq %rcx
+
+	leaq -64(%rdi), %rdi
+	leaq -64(%rsi), %rsi
+
+	movq 7*8(%rsi), %r11
+	movq 6*8(%rsi), %r8
+	movq %r11, 7*8(%rdi)
+	movq %r8, 6*8(%rdi)
+
+	movq 5*8(%rsi), %r9
+	movq 4*8(%rsi), %r10
+	movq %r9, 5*8(%rdi)
+	movq %r10, 4*8(%rdi)
+
+	movq 3*8(%rsi), %r11
+	movq 2*8(%rsi), %r8
+	movq %r11, 3*8(%rdi)
+	movq %r8, 2*8(%rdi)
+
+	movq 1*8(%rsi), %r9
+	movq 0*8(%rsi), %r10
+	movq %r9, 1*8(%rdi)
+	movq %r10, 0*8(%rdi)
+
+	jnz .Lloop_64
+
+.Lhandle_tail:
+	movq %rdx, %rcx
+	andq $63, %rcx
+	shrq $3, %rcx
+	jz .Lhandle_7
+
+	.p2align 4
+.Lloop_8:
+	decq %rcx
+
+	leaq -8(%rsi), %rsi
+	leaq -8(%rdi), %rdi
+
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movq %rdx, %rcx
+	andq $7, %rcx
+	jz .Lend
+
+	.p2align 4
+.Lloop_1:
+	decq %rcx
+
+	decq %rsi
+	decq %rdi
+
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
+
+	jnz .Lloop_1
+
+.Lend:
+	ret
+	CFI_ENDPROC
+ENDPROC(__memcpy_bwd)
+
+	.section .altinstructions, "a"
+	.align 8
+	.quad __memcpy_bwd
+	.quad .Lmemcpy_bwd_c
+	.word X86_FEATURE_REP_GOOD
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..bd4cbcc 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,16 @@
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
-	if (dest < src) {
+	if (dest < src || dest - src >= count)
 		return memcpy(dest, src, count);
-	} else {
+	else if (count <= 64) {
 		char *p = dest + count;
 		const char *s = src + count;
 		while (count--)
 			*--p = *--s;
-	}
-	return dest;
+
+		return dest;
+	} else
+		return __memcpy_bwd(dest, src, count);
 }
 EXPORT_SYMBOL(memmove);
-- 
1.7.0.1
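The timing harness itself is not included in this thread; the following is
only a rough user-space sketch of the kind of loop described in the
changelog (an overlapping copy of a fixed length, repeated many times, at
chosen misalignments). The buffer layout, the offsets and the use of the C
library's memmove are assumptions for illustration; the numbers above were
measured against the kernel routine, not this program.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Time `iters` overlapping memmove() calls of `len` bytes with the given
 * source/destination misalignments. dest lies above src inside the same
 * buffer, so the backwards path of memmove is exercised. */
static void bench(size_t len, size_t src_off, size_t dst_off, long iters)
{
    unsigned char *buf = malloc(len + 128);
    struct timespec t0, t1;

    memset(buf, 0xaa, len + 128);
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (long i = 0; i < iters; i++)
        memmove(buf + 32 + dst_off, buf + src_off, len);  /* dest > src, overlapping */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("len %zu src+%zu dst+%zu: %ld us\n", len, src_off, dst_off,
           (t1.tv_sec - t0.tv_sec) * 1000000L +
           (t1.tv_nsec - t0.tv_nsec) / 1000L);
    free(buf);
}

int main(void)
{
    bench(256, 0, 0, 50000);
    bench(1024, 3, 7, 50000);
    return 0;
}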
Andi Kleen
2010-Sep-16 06:48 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
> When the dest and the src do overlap and the memory area is large, memmove
> of x86_64 is very inefficient, and it led to bad performance, such as
> btrfs's file deletion performance. This patch improved the performance of
> memmove on x86_64 by using __memcpy_bwd() instead of byte copy when doing
> large memory area copy (len > 64).

I still don't understand why you don't simply use a backwards string copy
(with std)? That should be much simpler and hopefully be as optimized for
kernel copies on recent CPUs.

-Andi
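For reference, a "backwards string copy (with std)" means pointing rsi/rdi
at the last byte of each area, setting the direction flag, and letting the
movs instructions walk downwards. A minimal byte-granular sketch of the
idea, assuming GCC-style inline asm on x86-64 (a real implementation would
move quadwords first, as the later patches in this thread do; the function
name here is made up):

#include <stddef.h>

/* Copy backwards so that overlapping areas with dest > src are handled:
 * start at the last byte and let rep movsb decrement rsi/rdi. */
static void *memcpy_backwards_sketch(void *dest, const void *src, size_t count)
{
    unsigned char *d;
    const unsigned char *s;

    if (!count)
        return dest;
    d = (unsigned char *)dest + count - 1;
    s = (const unsigned char *)src + count - 1;
    asm volatile("std\n\t"      /* DF=1: movsb decrements rsi/rdi */
                 "rep movsb\n\t"
                 "cld"          /* restore DF=0, which callers expect */
                 : "+D" (d), "+S" (s), "+c" (count)
                 :
                 : "memory", "cc");
    return dest;
}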
On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
>> When the dest and the src do overlap and the memory area is large,
>> memmove of x86_64 is very inefficient, and it led to bad performance,
>> such as btrfs's file deletion performance. This patch improved the
>> performance of memmove on x86_64 by using __memcpy_bwd() instead of
>> byte copy when doing large memory area copy (len > 64).
>
> I still don't understand why you don't simply use a backwards
> string copy (with std)? That should be much simpler and
> hopefully be as optimized for kernel copies on recent CPUs.

But according to the comment in memcpy, some CPUs don't support the "REP"
instruction, so I think we must implement the backwards string copy by some
other method for those CPUs. That implementation is complex, which is why I
wrote it as a separate function -- __memcpy_bwd().

Thanks!
Miao
Andi Kleen
2010-Sep-16 08:40 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
On Thu, 16 Sep 2010 15:16:31 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> > I still don't understand why you don't simply use a backwards
> > string copy (with std)? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment in memcpy, some CPUs don't support the "REP"
> instruction,

I think you misread the comment. REP prefixes are in all x86 CPUs.
On some very old systems it wasn't optimized very well, but it probably
doesn't make too much sense to optimize for those. On newer CPUs, in fact,
REP should usually be faster than an unrolled loop.

I'm not sure how optimized the backwards copy is, but most likely it is
optimized too.

Here's an untested patch that implements the backwards copy with string
instructions. Could you run it through your test harness?

-Andi

Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <ak@linux.intel.com>

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..6e8258d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,28 @@ ENDPROC(__memcpy)
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards):
+	CFI_STARTPROC
+	std
+	movq %rdi, %rax
+	movl %edx, %ecx
+	add %rdx, %rdi
+	add %rdx, %rsi
+	shrl $3, %ecx
+	andl $7, %edx
+	rep movsq
+	movl %edx, %ecx
+	rep movsb
+	cld
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6c00304 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+extern void asmlinkage memcpy_backwards(void *dst, const void *src,
+				size_t count);
+
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
 	if (dest < src) {
 		return memcpy(dest, src, count);
 	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
+		return memcpy_backwards(dest, src, count);
 	}
 	return dest;
 }

-- 
ak@linux.intel.com -- Speaking for myself only.
On Thu, 16 Sep 2010 10:40:08 +0200, Andi Kleen wrote:
> I'm not sure how optimized the backwards copy is, but most likely it is
> optimized too.
>
> Here's an untested patch that implements the backwards copy with string
> instructions. Could you run it through your test harness?

OK, I'll do it.

> +ENTRY(memcpy_backwards):

s/://

> +	CFI_STARTPROC
> +	std
> +	movq %rdi, %rax
> +	movl %edx, %ecx
> +	add %rdx, %rdi
> +	add %rdx, %rsi

-	add %rdx, %rdi
-	add %rdx, %rsi
+	addq %rdx, %rdi
+	addq %rdx, %rsi

Besides that, the addresses in %rdi/%rsi now point past the end of the
memory area that is going to be copied, so we must adjust them before the
quadword copy:

+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi

> +	shrl $3, %ecx
> +	andl $7, %edx
> +	rep movsq

The same as above -- the pointers have to be moved back to the last
remaining byte before the trailing byte copy:

+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
+	decq %rsi
+	decq %rdi

> +	movl %edx, %ecx
> +	rep movsb
> +	cld
> +	ret
> +	CFI_ENDPROC
> +ENDPROC(memcpy_backwards)

> +extern void asmlinkage memcpy_backwards(void *dst, const void *src,
> +				size_t count);

The type of the return value must be "void *".

Thanks
Miao
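To make these fix-ups concrete, here is a worked trace with count = 20, i.e.
two quadwords plus a 4-byte tail (the numbers are only an illustration):

  after  addq %rdx, %rdi         rdi = dest + 20  (one byte past the end)
  after  leaq -8(%rdi), %rdi     rdi = dest + 12  (start of the last quadword)
  movsq #1 copies bytes 12..19;  DF=1, so rdi becomes dest + 4
  movsq #2 copies bytes  4..11;              rdi becomes dest - 4
  tail:  bytes 0..3 remain; movsb needs rdi to point at the last of them,
         dest + 3, hence the "+8 then -1" (equivalently +7) adjustment.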
Andi Kleen
2010-Sep-16 10:11 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
On Thu, 16 Sep 2010 17:29:32 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

OK, that was a very broken patch -- sorry, I should have done some more
work on it. Anyway, hopefully the corrected version is good for testing.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.
On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
> OK, that was a very broken patch -- sorry, I should have done some more
> work on it. Anyway, hopefully the corrected version is good for testing.
>
> -Andi

title: x86_64/lib: improve the performance of memmove

Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/lib/memcpy_64.S  |   29 +++++++++++++++++++++++++++++
 arch/x86/lib/memmove_64.c |    8 ++++----
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..9de5e9a 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,32 @@ ENDPROC(__memcpy)
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards)
+	CFI_STARTPROC
+	std
+	movq %rdi, %rax
+	movl %edx, %ecx
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi
+	shrl $3, %ecx
+	andl $7, %edx
+	rep movsq
+	addq $7, %rdi
+	addq $7, %rsi
+	movl %edx, %ecx
+	rep movsb
+	cld
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6774fd8 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+extern void * asmlinkage memcpy_backwards(void *dst, const void *src,
+				size_t count);
+
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
 	if (dest < src) {
 		return memcpy(dest, src, count);
 	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
+		return memcpy_backwards(dest, src, count);
 	}
 	return dest;
 }
-- 
1.7.0.1
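For readers less fluent in assembly, a plain-C model of what the corrected
routine does (an illustration written for this thread, not kernel code):
copy whole quadwords from the end of the area downwards, then the remaining
0-7 bytes, so an overlapping dest > src is handled correctly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memcpy_backwards_model(void *dest, const void *src, size_t count)
{
    unsigned char *d = (unsigned char *)dest + count;
    const unsigned char *s = (const unsigned char *)src + count;
    size_t qwords = count >> 3;   /* shrl $3, %ecx */
    size_t tail = count & 7;      /* andl $7, %edx */

    while (qwords--) {            /* rep movsq with DF=1 */
        uint64_t tmp;
        d -= 8;
        s -= 8;
        memcpy(&tmp, s, 8);       /* read the whole quadword before writing */
        memcpy(d, &tmp, 8);
    }
    while (tail--) {              /* rep movsb with DF=1 */
        d--;
        s--;
        *d = *s;
    }
    return dest;
}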
On Thu, 16 Sep 2010 18:47:59 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
>> OK, that was a very broken patch -- sorry, I should have done some more
>> work on it. Anyway, hopefully the corrected version is good for testing.
>>
>> -Andi

The test results are as follows:

Len   Src Unalign  Dest Unalign  Patch applied  Without Patch
---   -----------  ------------  -------------  -------------
8     0            0             0s 421117us    0s  70203us
8     0            3             0s 252622us    0s  42114us
8     0            7             0s 252663us    0s  42111us
8     3            0             0s 252666us    0s  42114us
8     3            3             0s 252667us    0s  42113us
8     3            7             0s 252667us    0s  42112us
32    0            0             0s 252672us    0s 114301us
32    0            3             0s 252676us    0s 114306us
32    0            7             0s 252663us    0s 114300us
32    3            0             0s 252661us    0s 114305us
32    3            3             0s 252663us    0s 114300us
32    3            7             0s 252668us    0s 114304us
64    0            0             0s 252672us    0s 236119us
64    0            3             0s 264671us    0s 236120us
64    0            7             0s 264702us    0s 236127us
64    3            0             0s 270701us    0s 236128us
64    3            3             0s 287236us    0s 236809us
64    3            7             0s 287257us    0s 236123us

According to the above results, the old version is better than the new one
when the memory area is small.

Len   Src Unalign  Dest Unalign  Patch applied  Without Patch
---   -----------  ------------  -------------  -------------
256   0            0             0s 281886us    0s 813660us
256   0            3             0s 332169us    0s 813645us
256   0            7             0s 342961us    0s 813639us
256   3            0             0s 305305us    0s 813634us
256   3            3             0s 386939us    0s 813638us
256   3            7             0s 370511us    0s 814335us
512   0            0             0s 310716us    1s 584677us
512   0            3             0s 456420us    1s 583353us
512   0            7             0s 468236us    1s 583248us
512   3            0             0s 493987us    1s 583659us
512   3            3             0s 588041us    1s 584294us
512   3            7             0s 605489us    1s 583650us
1024  0            0             0s 406971us    3s 123644us
1024  0            3             0s 748419us    3s 126514us
1024  0            7             0s 756153us    3s 127178us
1024  3            0             0s 854681us    3s 130013us
1024  3            3             1s  46828us    3s 140190us
1024  3            7             1s  35886us    3s 135508us

The new version is better when the memory area is large.

Thanks!
Miao
George Spelvin
2010-Sep-16 12:13 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
> void *memmove(void *dest, const void *src, size_t count)
> {
> 	if (dest < src) {
> 		return memcpy(dest, src, count);
> 	} else {
> -		char *p = dest + count;
> -		const char *s = src + count;
> -		while (count--)
> -			*--p = *--s;
> +		return memcpy_backwards(dest, src, count);
> 	}
> 	return dest;
> }

Er... presumably, the forward-copy case is somewhat better optimized, so it
should be preferred whenever the areas don't overlap; that is, whenever dest
exceeds src by at least the count. Assuming that size_t can hold a pointer:

	if ((size_t)dest - (size_t)src >= count)
		return memcpy(dest, src, count);
	else
		return memcpy_backwards(dest, src, count);

Or, using GCC's arithmetic-on-void-* extension:

	if ((size_t)(dest - src) >= count)
		...

etc. If src == dest, it doesn't matter which you use. You could skip the
copy entirely, but presumably that case doesn't arise often enough to be
worth testing for:

	if ((size_t)(dest - src) >= count)
		return memcpy(dest, src, count);
	else if (dest - src != 0)
		return memcpy_backwards(dest, src, count);
	else
		return dest;
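A quick user-space sanity check of that wraparound test (a sketch for
illustration only; the helper name is made up here). The forward copy is
unsafe exactly when dest lands strictly inside the source area, i.e. when
0 < dest - src < count; the unsigned subtraction folds the dest < src case
into "huge value >= count", so a single comparison covers both safe cases.

#include <assert.h>
#include <stddef.h>

static int forward_copy_is_safe(const char *dest, const char *src, size_t count)
{
    return (size_t)(dest - src) >= count;
}

int main(void)
{
    char buf[64];

    assert(forward_copy_is_safe(buf, buf + 8, 16));   /* dest < src, overlapping     */
    assert(forward_copy_is_safe(buf + 32, buf, 16));  /* dest > src, disjoint        */
    assert(!forward_copy_is_safe(buf + 4, buf, 16));  /* dest inside src: go backwards */
    return 0;
}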
On Thu, 2010-09-16 at 15:16 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> > I still don't understand why you don't simply use a backwards
> > string copy (with std)? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment in memcpy, some CPUs don't support the "REP"
> instruction,

Where do you find that the "REP" instruction is not supported on some CPUs?
The comment in arch/x86/lib/memcpy_64.S only states that some CPUs will run
faster when using the string copy instructions.

> so I think we must implement the backwards string copy by some other
> method for those CPUs. That implementation is complex, which is why I
> wrote it as a separate function -- __memcpy_bwd().

Will you please look at the tip/x86 tree (mem branch)? The memory copy on
x86_64 is already optimized there.

Thanks.
Yakui
On Fri, 17 Sep 2010 08:55:18 +0800, ykzhao wrote:
> Where do you find that the "REP" instruction is not supported on some CPUs?
> The comment in arch/x86/lib/memcpy_64.S only states that some CPUs will run
> faster when using the string copy instructions.

Sorry! I misread the comment.

> Will you please look at the tip/x86 tree (mem branch)? The memory copy on
> x86_64 is already optimized there.

Thanks for the reminder! It is very helpful.

Miao