Jan Beulich
2010-May-11  09:05 UTC
[Xen-devel] [PATCH] x86-32: use __builtin_{memcpy,memset}
Following a change in Linux 2.6.33, make x86-32 always use
__builtin_mem{cpy,set}() on gcc 4.0+. This particularly works around
certain intermediate gcc revisions generating out-of-range-array-index
warnings with the current inline implementation.
It may be worthwhile considering to make this the case for x86-64 too.
At the same time eliminate the redundant inline assembly in the C
file, and instead use the inline functions coming from the header.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- 2010-05-04.orig/xen/arch/x86/string.c	2007-11-26 16:57:03.000000000 +0100
+++ 2010-05-04/xen/arch/x86/string.c	2010-05-11 09:45:27.000000000 +0200
@@ -11,44 +11,13 @@
 #undef memcpy
 void *memcpy(void *dest, const void *src, size_t n)
 {
-    long d0, d1, d2;
-
-    asm volatile (
-#ifdef __i386__
-        "   rep movsl        ; "
-#else
-        "   rep movsq        ; "
-        "   testb $4,%b4     ; "
-        "   je 0f            ; "
-        "   movsl            ; "
-        "0:                  ; "
-#endif
-        "   testb $2,%b4     ; "
-        "   je 1f            ; "
-        "   movsw            ; "
-        "1: testb $1,%b4     ; "
-        "   je 2f            ; "
-        "   movsb            ; "
-        "2:                    "
-        : "=&c" (d0), "=&D" (d1),
"=&S" (d2)
-        : "0" (n/sizeof(long)), "q" (n), "1"
(dest), "2" (src)
-        : "memory");
-
-    return dest;
+    return __variable_memcpy(dest, src, n);
 }
 
 #undef memset
 void *memset(void *s, int c, size_t n)
 {
-    long d0, d1;
-
-    asm volatile (
-        "rep stosb"
-        : "=&c" (d0), "=&D" (d1)
-        : "a" (c), "1" (s), "0" (n)
-        : "memory");
-
-    return s;
+    return __memset_generic(s, c, n);
 }
 
 #undef memmove
--- 2010-05-04.orig/xen/include/asm-x86/string.h	2009-10-07 13:31:36.000000000
+0200
+++ 2010-05-04/xen/include/asm-x86/string.h	2010-05-11 10:21:04.000000000 +0200
@@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
     return to;
 }
 
+#define __HAVE_ARCH_MEMCPY
+#if defined(__i386__) && __GNUC__ >= 4
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
+
 /*
  * This looks horribly ugly, but the compiler can optimize it totally,
  * as the count is constant.
@@ -95,7 +100,6 @@ static always_inline void * __constant_m
     return to;
 }
 
-#define __HAVE_ARCH_MEMCPY
 /* align source to a 64-bit boundary */
 static always_inline
 void *__var_memcpy(void *t, const void *f, size_t n)
@@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
             __var_memcpy((t),(f),(n)));
 }
 
+#endif /* !__i386__ || __GNUC__ < 4 */
+
 /* Some version of gcc don''t have this builtin. It''s
non-critical anyway. */
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *dest, const void *src, size_t n);
 
-static inline void *__memset_generic(void *s, char c, size_t count)
+static inline void *__memset_generic(void *s, int c, size_t count)
 {
     long d0, d1;
     __asm__ __volatile__ (
@@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
     return s;
 }
 
+#define __HAVE_ARCH_MEMSET
+#if defined(__i386__) && __GNUC__ >= 4
+#define memset(s, c, n) __builtin_memset(s, c, n)
+#else
+
 /* we might want to write optimized versions of these later */
 #define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
 
@@ -238,11 +249,12 @@ static always_inline void *__constant_c_
 #define MEMSET_PATTERN_MUL 0x01010101UL
 #endif
 
-#define __HAVE_ARCH_MEMSET
 #define memset(s, c, count) (__memset((s),(c),(count)))
 #define __memset(s, c, count) \
 (__builtin_constant_p(c) ? \
  __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
  __var_x_memset((s),(c),(count)))
 
+#endif /* !__i386__ || __GNUC__ < 4 */
+
 #endif /* __X86_STRING_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Keir Fraser
2010-May-11  10:39 UTC
Re: [Xen-devel] [PATCH] x86-32: use __builtin_{memcpy,memset}
On 11/05/2010 10:05, "Jan Beulich" <JBeulich@novell.com> wrote:> Following a change in Linux 2.6.33, make x86-32 always use > __builtin_mem{cpy,set}() on gcc 4.0+. This particularly works around > certain intermediate gcc revisions generating out-of-range-array-index > warnings with the current inline implementation. > > It may be worthwhile considering to make this the case for x86-64 too. > > At the same time eliminate the redundant inline assembly in the C > file, and instead use the inline functions coming from the header.Hm, well, I hate having to change this stuff as we always seem to end up broken on some gcc or other. But otoh this will eliminate a bunch of code if we do this unconditionally, and I''m particularly not keen on doing this only for x86-32 and particular versions of gcc. I suggest the attached patch: it should work fine so long as all our supported versions of gcc have __builtin_memcpy and __builtin_memset. Given we nowadays only support GCC 3.4+, I imagine we are okay in this regard. What do you think to this alternative patch? -- Keir> Signed-off-by: Jan Beulich <jbeulich@novell.com> > > --- 2010-05-04.orig/xen/arch/x86/string.c 2007-11-26 16:57:03.000000000 +0100 > +++ 2010-05-04/xen/arch/x86/string.c 2010-05-11 09:45:27.000000000 +0200 > @@ -11,44 +11,13 @@ > #undef memcpy > void *memcpy(void *dest, const void *src, size_t n) > { > - long d0, d1, d2; > - > - asm volatile ( > -#ifdef __i386__ > - " rep movsl ; " > -#else > - " rep movsq ; " > - " testb $4,%b4 ; " > - " je 0f ; " > - " movsl ; " > - "0: ; " > -#endif > - " testb $2,%b4 ; " > - " je 1f ; " > - " movsw ; " > - "1: testb $1,%b4 ; " > - " je 2f ; " > - " movsb ; " > - "2: " > - : "=&c" (d0), "=&D" (d1), "=&S" (d2) > - : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src) > - : "memory"); > - > - return dest; > + return __variable_memcpy(dest, src, n); > } > > #undef memset > void *memset(void *s, int c, size_t n) > { > - long d0, d1; > - > - asm volatile ( > - "rep stosb" > - : "=&c" (d0), "=&D" (d1) > - : "a" (c), "1" (s), "0" (n) > - : "memory"); > - > - return s; > + return __memset_generic(s, c, n); > } > > #undef memmove > --- 2010-05-04.orig/xen/include/asm-x86/string.h 2009-10-07 13:31:36.000000000 > +0200 > +++ 2010-05-04/xen/include/asm-x86/string.h 2010-05-11 10:21:04.000000000 > +0200 > @@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo > return to; > } > > +#define __HAVE_ARCH_MEMCPY > +#if defined(__i386__) && __GNUC__ >= 4 > +#define memcpy(t, f, n) __builtin_memcpy(t, f, n) > +#else > + > /* > * This looks horribly ugly, but the compiler can optimize it totally, > * as the count is constant. > @@ -95,7 +100,6 @@ static always_inline void * __constant_m > return to; > } > > -#define __HAVE_ARCH_MEMCPY > /* align source to a 64-bit boundary */ > static always_inline > void *__var_memcpy(void *t, const void *f, size_t n) > @@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s > __var_memcpy((t),(f),(n))); > } > > +#endif /* !__i386__ || __GNUC__ < 4 */ > + > /* Some version of gcc don''t have this builtin. It''s non-critical anyway. */ > #define __HAVE_ARCH_MEMMOVE > extern void *memmove(void *dest, const void *src, size_t n); > > -static inline void *__memset_generic(void *s, char c, size_t count) > +static inline void *__memset_generic(void *s, int c, size_t count) > { > long d0, d1; > __asm__ __volatile__ ( > @@ -134,6 +140,11 @@ static inline void *__memset_generic(voi > return s; > } > > +#define __HAVE_ARCH_MEMSET > +#if defined(__i386__) && __GNUC__ >= 4 > +#define memset(s, c, n) __builtin_memset(s, c, n) > +#else > + > /* we might want to write optimized versions of these later */ > #define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count)) > > @@ -238,11 +249,12 @@ static always_inline void *__constant_c_ > #define MEMSET_PATTERN_MUL 0x01010101UL > #endif > > -#define __HAVE_ARCH_MEMSET > #define memset(s, c, count) (__memset((s),(c),(count))) > #define __memset(s, c, count) \ > (__builtin_constant_p(c) ? \ > __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : > \ > __var_x_memset((s),(c),(count))) > > +#endif /* !__i386__ || __GNUC__ < 4 */ > + > #endif /* __X86_STRING_H__ */ > > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2010-May-11  12:04 UTC
Re: [Xen-devel] [PATCH] x86-32: use __builtin_{memcpy,memset}
On 11/05/2010 11:39, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:> Hm, well, I hate having to change this stuff as we always seem to end up > broken on some gcc or other. But otoh this will eliminate a bunch of code if > we do this unconditionally, and I''m particularly not keen on doing this only > for x86-32 and particular versions of gcc. I suggest the attached patch: it > should work fine so long as all our supported versions of gcc have > __builtin_memcpy and __builtin_memset. Given we nowadays only support GCC > 3.4+, I imagine we are okay in this regard. > > What do you think to this alternative patch?Jan, What about this alternative version which distinguishes between GCC 4.3+ (which should auto-inline quite sensibly) and earlier versions (which need help via macro mapping explicitly to __builtin_*)? -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2010-May-11  12:33 UTC
Re: [Xen-devel] [PATCH] x86-32: use __builtin_{memcpy,memset}
>>> Keir Fraser <keir.fraser@eu.citrix.com> 11.05.10 14:04 >>> >What about this alternative version which distinguishes between GCC 4.3+ >(which should auto-inline quite sensibly) and earlier versions (which need >help via macro mapping explicitly to __builtin_*)?I don''t think this will do what we expect in the presence of -fno-builtin (see also my other response). Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel