When dest and src overlap and the memory area is large, the x86_64 memmove
is very inefficient, and it leads to bad performance, such as btrfs's file
deletion performance. This patch improves the performance of memmove on
x86_64 by using __memcpy_bwd() instead of a byte-by-byte copy when copying a
large memory area (len > 64).

I have tested this patchset by doing a 500-byte memory copy 50000 times with
various alignments and buffer sizes on my x86_64 box:

Len   Src Unalign  Dest Unalign  Without Patch  Patch applied
---   -----------  ------------  -------------  -------------
256   0            0             0s 815158us    0s 249647us
256   0            4             0s 816059us    0s 324210us
256   0            7             0s 815192us    0s 324254us
256   3            0             0s 815179us    0s 325991us
256   3            1             0s 815161us    0s 378462us
256   3            4             0s 815154us    0s 779306us
256   3            7             0s 815151us    0s 782924us
256   7            0             0s 815839us    0s 325524us
256   7            4             0s 815149us    0s 375658us
256   7            7             0s 815160us    0s 374488us
1024  0            0             3s 125891us    0s 437662us
1024  0            1             3s 125940us    0s 777524us
1024  0            4             3s 159788us    0s 778850us
1024  0            7             3s 155177us    0s 733927us
1024  4            0             3s 118323us    0s 830167us
1024  4            4             3s 129124us    0s 962505us
1024  4            7             3s 123456us    2s 600326us

After applying this patchset, the performance of file creation and deletion
on some filesystems becomes better. I have tested it with the following
benchmark tool on my x86_64 box:

  http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3

Test steps:
  # ./creat_unlink 50000

The result (total time):
Ext4:
                2.6.36-rc4    2.6.36-rc4 + patch
file creation   0.737007      0.701888            4.8% UP
file deletion   0.422226      0.413457            2.1% UP
Btrfs:
                2.6.36-rc4    2.6.36-rc4 + patch
file creation   0.977638      0.935208            4.3% UP
file deletion   1.327140      1.221073            8%   UP

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/include/asm/string_64.h |    1 +
 arch/x86/lib/Makefile            |    2 +-
 arch/x86/lib/memcpy_bwd_64.S     |  137 ++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/memmove_64.c        |   10 ++-
 4 files changed, 145 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/lib/memcpy_bwd_64.S

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..4e64a87 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -55,6 +55,7 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 void *memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
+extern void *__memcpy_bwd(void *dest, const void *src, size_t count);
 void *memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf07..ab241df 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_bwd_$(BITS).o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
diff --git a/arch/x86/lib/memcpy_bwd_64.S b/arch/x86/lib/memcpy_bwd_64.S
new file mode 100644
index 0000000..ca894e3
--- /dev/null
+++ b/arch/x86/lib/memcpy_bwd_64.S
@@ -0,0 +1,137 @@
+/* Copyright 2010 Miao Xie */
+
+#include <linux/linkage.h>
+
+#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+
+/*
+ * __memcpy_bwd - Copy a memory block from the end to the beginning
+ *
+ * Input:
+ *	rdi destination
+ *	rsi source
+ *	rdx count
+ *
+ * Output:
+ *	rax original destination
+ */
+
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_bwd_c:
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi
+
+	std
+
+	movq %rdx, %rcx
+	shrq $3, %rcx
+	andq $7, %rdx
+	rep movsq
+
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
+	decq %rsi
+	decq %rdi
+	movq %rdx, %rcx
+	rep movsb
+
+	cld
+	ret
+.Lmemcpy_bwd_e:
+	.previous
+
+ENTRY(__memcpy_bwd)
+	CFI_STARTPROC
+
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+
+	movq %rdx, %rcx
+	shrq $6, %rcx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq %rcx
+
+	leaq -64(%rdi), %rdi
+	leaq -64(%rsi), %rsi
+
+	movq 7*8(%rsi), %r11
+	movq 6*8(%rsi), %r8
+	movq %r11, 7*8(%rdi)
+	movq %r8, 6*8(%rdi)
+
+	movq 5*8(%rsi), %r9
+	movq 4*8(%rsi), %r10
+	movq %r9, 5*8(%rdi)
+	movq %r10, 4*8(%rdi)
+
+	movq 3*8(%rsi), %r11
+	movq 2*8(%rsi), %r8
+	movq %r11, 3*8(%rdi)
+	movq %r8, 2*8(%rdi)
+
+	movq 1*8(%rsi), %r9
+	movq 0*8(%rsi), %r10
+	movq %r9, 1*8(%rdi)
+	movq %r10, 0*8(%rdi)
+
+	jnz .Lloop_64
+
+.Lhandle_tail:
+	movq %rdx, %rcx
+	andq $63, %rcx
+	shrq $3, %rcx
+	jz .Lhandle_7
+
+	.p2align 4
+.Lloop_8:
+	decq %rcx
+
+	leaq -8(%rsi), %rsi
+	leaq -8(%rdi), %rdi
+
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movq %rdx, %rcx
+	andq $7, %rcx
+	jz .Lend
+
+	.p2align 4
+.Lloop_1:
+	decq %rcx
+
+	decq %rsi
+	decq %rdi
+
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
+
+	jnz .Lloop_1
+
+.Lend:
+	ret
+	CFI_ENDPROC
+ENDPROC(__memcpy_bwd)
+
+	.section .altinstructions, "a"
+	.align 8
+	.quad __memcpy_bwd
+	.quad .Lmemcpy_bwd_c
+	.word X86_FEATURE_REP_GOOD
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..bd4cbcc 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,16 @@
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
-	if (dest < src) {
+	if (dest < src || dest - src >= count)
 		return memcpy(dest, src, count);
-	} else {
+	else if (count <= 64) {
 		char *p = dest + count;
 		const char *s = src + count;
 		while (count--)
 			*--p = *--s;
-	}
-	return dest;
+
+		return dest;
+	} else
+		return __memcpy_bwd(dest, src, count);
 }
 EXPORT_SYMBOL(memmove);
-- 
1.7.0.1
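The timing harness itself is not included in this thread; the following is
only a rough user-space sketch of the kind of loop described in the
changelog (an overlapping copy of a fixed length, repeated many times, at
chosen misalignments). The buffer layout, the offsets and the use of the C
library's memmove are assumptions for illustration; the numbers above were
measured against the kernel routine, not this program.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Time `iters` overlapping memmove() calls of `len` bytes with the given
 * source/destination misalignments. dest lies above src inside the same
 * buffer, so the backwards path of memmove is exercised. */
static void bench(size_t len, size_t src_off, size_t dst_off, long iters)
{
    unsigned char *buf = malloc(len + 128);
    struct timespec t0, t1;

    memset(buf, 0xaa, len + 128);
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (long i = 0; i < iters; i++)
        memmove(buf + 32 + dst_off, buf + src_off, len);  /* dest > src, overlapping */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("len %zu src+%zu dst+%zu: %ld us\n", len, src_off, dst_off,
           (t1.tv_sec - t0.tv_sec) * 1000000L +
           (t1.tv_nsec - t0.tv_nsec) / 1000L);
    free(buf);
}

int main(void)
{
    bench(256, 0, 0, 50000);
    bench(1024, 3, 7, 50000);
    return 0;
}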
Andi Kleen
2010-Sep-16 06:48 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
> When the dest and the src do overlap and the memory area is large, memmove
> of x86_64 is very inefficient, and it led to bad performance, such as
> btrfs's file deletion performance. This patch improved the performance of
> memmove on x86_64 by using __memcpy_bwd() instead of byte copy when doing
> large memory area copy (len > 64).

I still don't understand why you don't simply use a backwards string copy
(with std)? That should be much simpler and hopefully be as optimized for
kernel copies on recent CPUs.

-Andi
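For reference, a "backwards string copy (with std)" means pointing rsi/rdi
at the last byte of each area, setting the direction flag, and letting the
movs instructions walk downwards. A minimal byte-granular sketch of the
idea, assuming GCC-style inline asm on x86-64 (a real implementation would
move quadwords first, as the later patches in this thread do; the function
name here is made up):

#include <stddef.h>

/* Copy backwards so that overlapping areas with dest > src are handled:
 * start at the last byte and let rep movsb decrement rsi/rdi. */
static void *memcpy_backwards_sketch(void *dest, const void *src, size_t count)
{
    unsigned char *d;
    const unsigned char *s;

    if (!count)
        return dest;
    d = (unsigned char *)dest + count - 1;
    s = (const unsigned char *)src + count - 1;
    asm volatile("std\n\t"      /* DF=1: movsb decrements rsi/rdi */
                 "rep movsb\n\t"
                 "cld"          /* restore DF=0, which callers expect */
                 : "+D" (d), "+S" (s), "+c" (count)
                 :
                 : "memory", "cc");
    return dest;
}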
On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
>> When the dest and the src do overlap and the memory area is large,
>> memmove of x86_64 is very inefficient, and it led to bad performance,
>> such as btrfs's file deletion performance. This patch improved the
>> performance of memmove on x86_64 by using __memcpy_bwd() instead of
>> byte copy when doing large memory area copy (len > 64).
>
> I still don't understand why you don't simply use a backwards
> string copy (with std)? That should be much simpler and
> hopefully be as optimized for kernel copies on recent CPUs.

But according to the comment in memcpy, some CPUs don't support the "REP"
instruction, so I think we must implement the backwards string copy by some
other method for those CPUs. That implementation is complex, which is why I
wrote it as a separate function -- __memcpy_bwd().

Thanks!
Miao
Andi Kleen
2010-Sep-16 08:40 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
On Thu, 16 Sep 2010 15:16:31 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> > I still don't understand why you don't simply use a backwards
> > string copy (with std)? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment in memcpy, some CPUs don't support the "REP"
> instruction,

I think you misread the comment. REP prefixes are in all x86 CPUs.
On some very old systems it wasn't optimized very well, but it probably
doesn't make too much sense to optimize for those. On newer CPUs, in fact,
REP should usually be faster than an unrolled loop.

I'm not sure how optimized the backwards copy is, but most likely it is
optimized too.

Here's an untested patch that implements the backwards copy with string
instructions. Could you run it through your test harness?

-Andi

Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <ak@linux.intel.com>

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..6e8258d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,28 @@ ENDPROC(__memcpy)
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards):
+	CFI_STARTPROC
+	std
+	movq %rdi, %rax
+	movl %edx, %ecx
+	add %rdx, %rdi
+	add %rdx, %rsi
+	shrl $3, %ecx
+	andl $7, %edx
+	rep movsq
+	movl %edx, %ecx
+	rep movsb
+	cld
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6c00304 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+extern void asmlinkage memcpy_backwards(void *dst, const void *src,
+				size_t count);
+
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
 	if (dest < src) {
 		return memcpy(dest, src, count);
 	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
+		return memcpy_backwards(dest, src, count);
 	}
 	return dest;
 }

-- 
ak@linux.intel.com -- Speaking for myself only.
On Thu, 16 Sep 2010 10:40:08 +0200, Andi Kleen wrote:
> I'm not sure how optimized the backwards copy is, but most likely it is
> optimized too.
>
> Here's an untested patch that implements the backwards copy with string
> instructions. Could you run it through your test harness?

OK, I'll do it.

> +ENTRY(memcpy_backwards):

s/://

> +	CFI_STARTPROC
> +	std
> +	movq %rdi, %rax
> +	movl %edx, %ecx
> +	add %rdx, %rdi
> +	add %rdx, %rsi

-	add %rdx, %rdi
-	add %rdx, %rsi
+	addq %rdx, %rdi
+	addq %rdx, %rsi

Besides that, the addresses in %rdi/%rsi now point past the end of the
memory area that is going to be copied, so we must adjust them before the
quadword copy:

+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi

> +	shrl $3, %ecx
> +	andl $7, %edx
> +	rep movsq

The same as above -- the pointers have to be moved back to the last
remaining byte before the trailing byte copy:

+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
+	decq %rsi
+	decq %rdi

> +	movl %edx, %ecx
> +	rep movsb
> +	cld
> +	ret
> +	CFI_ENDPROC
> +ENDPROC(memcpy_backwards)

> +extern void asmlinkage memcpy_backwards(void *dst, const void *src,
> +				size_t count);

The type of the return value must be "void *".

Thanks
Miao
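To make these fix-ups concrete, here is a worked trace with count = 20, i.e.
two quadwords plus a 4-byte tail (the numbers are only an illustration):

  after  addq %rdx, %rdi         rdi = dest + 20  (one byte past the end)
  after  leaq -8(%rdi), %rdi     rdi = dest + 12  (start of the last quadword)
  movsq #1 copies bytes 12..19;  DF=1, so rdi becomes dest + 4
  movsq #2 copies bytes  4..11;              rdi becomes dest - 4
  tail:  bytes 0..3 remain; movsb needs rdi to point at the last of them,
         dest + 3, hence the "+8 then -1" (equivalently +7) adjustment.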
Andi Kleen
2010-Sep-16 10:11 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
On Thu, 16 Sep 2010 17:29:32 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

OK, that was a very broken patch -- sorry, I should have done some more
work on it. Anyway, hopefully the corrected version is good for testing.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.
On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
> OK, that was a very broken patch -- sorry, I should have done some more
> work on it. Anyway, hopefully the corrected version is good for testing.
>
> -Andi

title: x86_64/lib: improve the performance of memmove

Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/lib/memcpy_64.S  |   29 +++++++++++++++++++++++++++++
 arch/x86/lib/memmove_64.c |    8 ++++----
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..9de5e9a 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,32 @@ ENDPROC(__memcpy)
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.byte .Lmemcpy_e - .Lmemcpy_c
 	.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards)
+	CFI_STARTPROC
+	std
+	movq %rdi, %rax
+	movl %edx, %ecx
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi
+	shrl $3, %ecx
+	andl $7, %edx
+	rep movsq
+	addq $7, %rdi
+	addq $7, %rsi
+	movl %edx, %ecx
+	rep movsb
+	cld
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6774fd8 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+extern void * asmlinkage memcpy_backwards(void *dst, const void *src,
+				size_t count);
+
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
 	if (dest < src) {
 		return memcpy(dest, src, count);
 	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
+		return memcpy_backwards(dest, src, count);
 	}
 	return dest;
 }
-- 
1.7.0.1
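For readers less fluent in assembly, a plain-C model of what the corrected
routine does (an illustration written for this thread, not kernel code):
copy whole quadwords from the end of the area downwards, then the remaining
0-7 bytes, so an overlapping dest > src is handled correctly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memcpy_backwards_model(void *dest, const void *src, size_t count)
{
    unsigned char *d = (unsigned char *)dest + count;
    const unsigned char *s = (const unsigned char *)src + count;
    size_t qwords = count >> 3;   /* shrl $3, %ecx */
    size_t tail = count & 7;      /* andl $7, %edx */

    while (qwords--) {            /* rep movsq with DF=1 */
        uint64_t tmp;
        d -= 8;
        s -= 8;
        memcpy(&tmp, s, 8);       /* read the whole quadword before writing */
        memcpy(d, &tmp, 8);
    }
    while (tail--) {              /* rep movsb with DF=1 */
        d--;
        s--;
        *d = *s;
    }
    return dest;
}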
On Thu, 16 Sep 2010 18:47:59 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
>> OK, that was a very broken patch -- sorry, I should have done some more
>> work on it. Anyway, hopefully the corrected version is good for testing.
>>
>> -Andi

The test results are as follows:

Len   Src Unalign  Dest Unalign  Patch applied  Without Patch
---   -----------  ------------  -------------  -------------
8     0            0             0s 421117us    0s  70203us
8     0            3             0s 252622us    0s  42114us
8     0            7             0s 252663us    0s  42111us
8     3            0             0s 252666us    0s  42114us
8     3            3             0s 252667us    0s  42113us
8     3            7             0s 252667us    0s  42112us
32    0            0             0s 252672us    0s 114301us
32    0            3             0s 252676us    0s 114306us
32    0            7             0s 252663us    0s 114300us
32    3            0             0s 252661us    0s 114305us
32    3            3             0s 252663us    0s 114300us
32    3            7             0s 252668us    0s 114304us
64    0            0             0s 252672us    0s 236119us
64    0            3             0s 264671us    0s 236120us
64    0            7             0s 264702us    0s 236127us
64    3            0             0s 270701us    0s 236128us
64    3            3             0s 287236us    0s 236809us
64    3            7             0s 287257us    0s 236123us

According to the above results, the old version is better than the new one
when the memory area is small.

Len   Src Unalign  Dest Unalign  Patch applied  Without Patch
---   -----------  ------------  -------------  -------------
256   0            0             0s 281886us    0s 813660us
256   0            3             0s 332169us    0s 813645us
256   0            7             0s 342961us    0s 813639us
256   3            0             0s 305305us    0s 813634us
256   3            3             0s 386939us    0s 813638us
256   3            7             0s 370511us    0s 814335us
512   0            0             0s 310716us    1s 584677us
512   0            3             0s 456420us    1s 583353us
512   0            7             0s 468236us    1s 583248us
512   3            0             0s 493987us    1s 583659us
512   3            3             0s 588041us    1s 584294us
512   3            7             0s 605489us    1s 583650us
1024  0            0             0s 406971us    3s 123644us
1024  0            3             0s 748419us    3s 126514us
1024  0            7             0s 756153us    3s 127178us
1024  3            0             0s 854681us    3s 130013us
1024  3            3             1s  46828us    3s 140190us
1024  3            7             1s  35886us    3s 135508us

The new version is better when the memory area is large.

Thanks!
Miao
George Spelvin
2010-Sep-16 12:13 UTC
Re: [PATCH] x86_64/lib: improve the performance of memmove
> void *memmove(void *dest, const void *src, size_t count)
> {
> 	if (dest < src) {
> 		return memcpy(dest, src, count);
> 	} else {
> -		char *p = dest + count;
> -		const char *s = src + count;
> -		while (count--)
> -			*--p = *--s;
> +		return memcpy_backwards(dest, src, count);
> 	}
> 	return dest;
> }

Er... presumably, the forward-copy case is somewhat better optimized, so it
should be preferred whenever the areas don't overlap; that is, whenever dest
exceeds src by at least the count. Assuming that size_t can hold a pointer:

	if ((size_t)dest - (size_t)src >= count)
		return memcpy(dest, src, count);
	else
		return memcpy_backwards(dest, src, count);

Or, using GCC's arithmetic-on-void-* extension:

	if ((size_t)(dest - src) >= count)
		...

etc. If src == dest, it doesn't matter which you use. You could skip the
copy entirely, but presumably that case doesn't arise often enough to be
worth testing for:

	if ((size_t)(dest - src) >= count)
		return memcpy(dest, src, count);
	else if (dest - src != 0)
		return memcpy_backwards(dest, src, count);
	else
		return dest;
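A quick user-space sanity check of that wraparound test (a sketch for
illustration only; the helper name is made up here). The forward copy is
unsafe exactly when dest lands strictly inside the source area, i.e. when
0 < dest - src < count; the unsigned subtraction folds the dest < src case
into "huge value >= count", so a single comparison covers both safe cases.

#include <assert.h>
#include <stddef.h>

static int forward_copy_is_safe(const char *dest, const char *src, size_t count)
{
    return (size_t)(dest - src) >= count;
}

int main(void)
{
    char buf[64];

    assert(forward_copy_is_safe(buf, buf + 8, 16));   /* dest < src, overlapping     */
    assert(forward_copy_is_safe(buf + 32, buf, 16));  /* dest > src, disjoint        */
    assert(!forward_copy_is_safe(buf + 4, buf, 16));  /* dest inside src: go backwards */
    return 0;
}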
On Thu, 2010-09-16 at 15:16 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> > I still don't understand why you don't simply use a backwards
> > string copy (with std)? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment in memcpy, some CPUs don't support the "REP"
> instruction,

Where do you find that the "REP" instruction is not supported on some CPUs?
The comment in arch/x86/lib/memcpy_64.S only states that some CPUs will run
faster when using the string copy instructions.

> so I think we must implement the backwards string copy by some other
> method for those CPUs. That implementation is complex, which is why I
> wrote it as a separate function -- __memcpy_bwd().

Will you please look at the tip/x86 tree (mem branch)? The memory copy on
x86_64 is already optimized there.

Thanks.
Yakui
On Fri, 17 Sep 2010 08:55:18 +0800, ykzhao wrote:
> Where do you find that the "REP" instruction is not supported on some CPUs?
> The comment in arch/x86/lib/memcpy_64.S only states that some CPUs will run
> faster when using the string copy instructions.

Sorry! I misread the comment.

> Will you please look at the tip/x86 tree (mem branch)? The memory copy on
> x86_64 is already optimized there.

Thanks for the reminder! It is very helpful.

Miao