thr3ads.net - Nouveau - [Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available [Jul 2013]

If this information is useful, please help other people find it:
Share via:

Ilia Mirkin

2013-Jul-29 06:40 UTC

[Nouveau] [PATCH 1/2] xv: fix last pixel for big-endian machines in YV12 -> NV12 conversion

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/nouveau_xv.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
index 8eafcf0..567e30c 100644
--- a/src/nouveau_xv.c
+++ b/src/nouveau_xv.c
@@ -552,8 +552,11 @@ NVCopyNV12ColorPlanes(unsigned char *src1, unsigned char *src2,
 
 		if (e) {
 			unsigned short *vud = (unsigned short *) vuvud;
-
+#if X_BYTE_ORDER == X_BIG_ENDIAN
+			*vud = us[0] | (vs[0]<<8);
+#else
 			*vud = vs[0] | (us[0]<<8);
+#endif
 		}
 
 		dst += dstPitch;
-- 
1.8.1.5

Ilia Mirkin

2013-Jul-29 06:40 UTC

head link

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

memcpy() goes from taking 45% to 66% of total function time, which
translates to a 30% decrease in NVPutImage runtime.

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/nouveau_xv.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
index 567e30c..5569b7c 100644
--- a/src/nouveau_xv.c
+++ b/src/nouveau_xv.c
@@ -25,6 +25,8 @@
 #include "config.h"
 #endif
 
+#include <immintrin.h>
+
 #include "xf86xv.h"
 #include <X11/extensions/Xv.h>
 #include "exa.h"
@@ -532,30 +534,47 @@ NVCopyNV12ColorPlanes(unsigned char *src1, unsigned char *src2,
 
 	w >>= 1;
 	h >>= 1;
+#ifdef __SSE2__
+	l = w >> 3;
+	e = w & 7;
+#else
 	l = w >> 1;
 	e = w & 1;
+#endif
 
 	for (j = 0; j < h; j++) {
 		unsigned char *us = src1;
 		unsigned char *vs = src2;
 		unsigned int *vuvud = (unsigned int *) dst;
+		unsigned short *vud;
 
 		for (i = 0; i < l; i++) {
-#if X_BYTE_ORDER == X_BIG_ENDIAN
+#ifdef __SSE2__
+			_mm_storeu_si128(
+				(void*)vuvud,
+				_mm_unpacklo_epi8(
+					_mm_loadl_epi64((void*)vs),
+					_mm_loadl_epi64((void*)us)));
+			vuvud+=4;
+			us+=8;
+			vs+=8;
+#else /* __SSE2__ */
+#  if X_BYTE_ORDER == X_BIG_ENDIAN
 			*vuvud++ = (vs[0]<<24) | (us[0]<<16) | (vs[1]<<8) | us[1];
-#else
+#  else
 			*vuvud++ = vs[0] | (us[0]<<8) | (vs[1]<<16) | (us[1]<<24);
-#endif
+#  endif
 			us+=2;
 			vs+=2;
+#endif /* __SSE2__ */
 		}
 
-		if (e) {
-			unsigned short *vud = (unsigned short *) vuvud;
+		vud = (unsigned short *)vuvud;
+		for (i = 0; i < e; i++) {
 #if X_BYTE_ORDER == X_BIG_ENDIAN
-			*vud = us[0] | (vs[0]<<8);
+			vud[i] = us[i] | (vs[i]<<8);
 #else
-			*vud = vs[0] | (us[0]<<8);
+			vud[i] = vs[i] | (us[i]<<8);
 #endif
 		}
 
-- 
1.8.1.5

Sven Joachim

2013-Jul-31 17:16 UTC

head link

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

On 2013-07-29 08:40 +0200, Ilia Mirkin wrote:
> memcpy() goes from taking 45% to 66% of total function time, which
> translates to a 30% decrease in NVPutImage runtime.
>
> Signed-off-by: Ilia Mirkin <imirkin-FrUbXkNCsVf2fBVCVOL8/A at public.gmane.org>
> ---
>  src/nouveau_xv.c | 33 ++++++++++++++++++++++++++-------
>  1 file changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
> index 567e30c..5569b7c 100644
> --- a/src/nouveau_xv.c
> +++ b/src/nouveau_xv.c
> @@ -25,6 +25,8 @@
>  #include "config.h"
>  #endif
>  
> +#include <immintrin.h>
> +
Unfortunately, immintrin.h is not available on most architectures,
leading to build failures as can be seen on
https://buildd.debian.org/status/package.php?p=xserver-xorg-video-nouveau.

Any ideas?
>  #include "xf86xv.h"
>  #include <X11/extensions/Xv.h>
>  #include "exa.h"
> @@ -532,30 +534,47 @@ NVCopyNV12ColorPlanes(unsigned char *src1, unsigned char *src2,
>  
>  	w >>= 1;
>  	h >>= 1;
> +#ifdef __SSE2__
> +	l = w >> 3;
> +	e = w & 7;
> +#else
>  	l = w >> 1;
>  	e = w & 1;
> +#endif
>  
>  	for (j = 0; j < h; j++) {
>  		unsigned char *us = src1;
>  		unsigned char *vs = src2;
>  		unsigned int *vuvud = (unsigned int *) dst;
> +		unsigned short *vud;
>  
>  		for (i = 0; i < l; i++) {
> -#if X_BYTE_ORDER == X_BIG_ENDIAN
> +#ifdef __SSE2__
> +			_mm_storeu_si128(
> +				(void*)vuvud,
> +				_mm_unpacklo_epi8(
> +					_mm_loadl_epi64((void*)vs),
> +					_mm_loadl_epi64((void*)us)));
> +			vuvud+=4;
> +			us+=8;
> +			vs+=8;
> +#else /* __SSE2__ */
> +#  if X_BYTE_ORDER == X_BIG_ENDIAN
>  			*vuvud++ = (vs[0]<<24) | (us[0]<<16) | (vs[1]<<8) | us[1];
> -#else
> +#  else
>  			*vuvud++ = vs[0] | (us[0]<<8) | (vs[1]<<16) | (us[1]<<24);
> -#endif
> +#  endif
>  			us+=2;
>  			vs+=2;
> +#endif /* __SSE2__ */
>  		}
>  
> -		if (e) {
> -			unsigned short *vud = (unsigned short *) vuvud;
> +		vud = (unsigned short *)vuvud;
> +		for (i = 0; i < e; i++) {
>  #if X_BYTE_ORDER == X_BIG_ENDIAN
> -			*vud = us[0] | (vs[0]<<8);
> +			vud[i] = us[i] | (vs[i]<<8);
>  #else
> -			*vud = vs[0] | (us[0]<<8);
> +			vud[i] = vs[i] | (us[i]<<8);
>  #endif
>  		}
>  
> -- 
> 1.8.1.5
Cheers,
       Sven

Sven Joachim

2013-Jul-31 17:28 UTC

head link

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

On 2013-07-31 19:18 +0200, Ilia Mirkin wrote:
> On Wed, Jul 31, 2013 at 1:16 PM, Sven Joachim <svenjoac at gmx.de> wrote:
>>
>> Unfortunately, immintrin.h is not available on most architectures,
>> leading to build failures as can be seen on
>> https://buildd.debian.org/status/package.php?p=xserver-xorg-video-nouveau.
>
> Sorry :( I thought that immintrin.h would be available everywhere and
> just end up empty since none of the __SSE*__ would be defined. I was
> wrong.
>
>>
>> Any ideas?
>
> A fix is checked into master already:
> http://cgit.freedesktop.org/nouveau/xf86-video-nouveau/commit/?id=1df177f35a05db505577cdc929e63fde906a704b
Ah, good to hear and sorry that I didn't check myself.  I'll merge that
commit into the Debian package.

Cheers,
       Sven

Possibly Parallel Threads

Search for more possibly parallel threads

Nouveau - Jul 2013 - [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

[Nouveau] [PATCH 1/2] xv: fix last pixel for big-endian machines in YV12 -> NV12 conversion

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

[Nouveau] [PATCH 2/2] xv: speed up YV12 -> NV12 conversion using SSE2 if available

Possibly Parallel Threads