Ilia Mirkin
2021-Mar-09 01:02 UTC
[Nouveau] [PATCH] xv: add MMX / SSE acceleration for YV12 -> YUYV repacking
This is used by the blit adaptor. Might as well try to accelerate it.
When testing with it hacked to take effect for nvc0, saw a decrease of
NVPutImage usage in the X process from 68% -> 43% (MMX) -> 24% (SSE)
(which is approximately a 7x speed-up to the function, assuming other
parts remained equal).

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---

I did some basic testing with a patch to force the texture path to do
this conversion rather than to NV12, testing all 3 cases. However I
need to do better testing of edge cases, which I will do before
pushing.

 src/nouveau_xv.c | 94 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 14 deletions(-)

diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
index b2d75c5..16aca93 100644
--- a/src/nouveau_xv.c
+++ b/src/nouveau_xv.c
@@ -25,7 +25,7 @@
 #include "config.h"
 #endif
 
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__MMX__)
 #include <immintrin.h>
 #endif
 
@@ -568,7 +568,7 @@ NVCopyData420(unsigned char *src1, unsigned char *src2, unsigned char *src3,
 {
 	CARD32 *dst;
 	CARD8 *s1, *s2, *s3;
-	int i, j;
+	int i, j, l, e;
 
 #define su(X) (((j & 1) && j < (h-1)) ? ((unsigned)((signed int)s2[X] + \
 		(signed int)(s2 + srcPitch2)[X]) / 2) : (s2[X]))
@@ -576,29 +576,95 @@ NVCopyData420(unsigned char *src1, unsigned char *src2, unsigned char *src3,
 		(signed int)(s3 + srcPitch2)[X]) / 2) : (s3[X]))
 
 	w >>= 1;
+#ifdef __MMX__
+	l = w >> 3;
+	e = w & 7;
+#else
+	l = w >> 2;
+	e = w & 3;
+#endif
 	for (j = 0; j < h; j++) {
 		dst = (CARD32*)dst1;
 		s1 = src1; s2 = src2; s3 = src3;
 		i = w;
 
-		while (i > 4) {
+		for (i = 0; i < l; i++) {
+#ifdef __MMX__
+			__m64 mm_v = *(__m64 *)&s2[0];
+			__m64 mm_u = *(__m64 *)&s3[0];
+
+			if (j & 1 && j < (h - 1)) {
+				__m64 mm_vnext = *(__m64 *)&(s2 + srcPitch2)[0];
+#ifdef __SSE__
+				mm_v = _mm_avg_pu8(mm_v, mm_vnext);
+#else /* __SSE__ */
+				__m64 zero = _m_from_int(0);
+				/* make 16-bit wide values */
+				__m64 mm_vnext16_1 = _mm_unpacklo_pi8(mm_vnext, zero);
+				__m64 mm_vnext16_2 = _mm_unpackhi_pi8(mm_vnext, zero);
+				__m64 mm_v16_1 = _mm_unpacklo_pi8(mm_v, zero);
+				__m64 mm_v16_2 = _mm_unpackhi_pi8(mm_v, zero);
+				/* add together */
+				mm_v16_1 = _mm_add_pi16(mm_v16_1, mm_vnext16_1);
+				mm_v16_2 = _mm_add_pi16(mm_v16_2, mm_vnext16_2);
+				/* divide by 2 */
+				mm_v16_1 = _mm_srli_pi16(mm_v16_1, 1);
+				mm_v16_2 = _mm_srli_pi16(mm_v16_2, 1);
+				/* put back into 8-bit values */
+				mm_v = _mm_packs_pu16(mm_v16_1, mm_v16_2);
+#endif
+
+				/* repeat for u */
+				__m64 mm_unext = *(__m64 *)&(s3 + srcPitch2)[0];
+#ifdef __SSE__
+				mm_u = _mm_avg_pu8(mm_u, mm_unext);
+#else /* __SSE__ */
+				/* make 16-bit wide values */
+				__m64 mm_unext16_1 = _mm_unpacklo_pi8(mm_unext, zero);
+				__m64 mm_unext16_2 = _mm_unpackhi_pi8(mm_unext, zero);
+				__m64 mm_u16_1 = _mm_unpacklo_pi8(mm_u, zero);
+				__m64 mm_u16_2 = _mm_unpackhi_pi8(mm_u, zero);
+				/* add together */
+				mm_u16_1 = _mm_add_pi16(mm_u16_1, mm_unext16_1);
+				mm_u16_2 = _mm_add_pi16(mm_u16_2, mm_unext16_2);
+				/* divide by 2 */
+				mm_u16_1 = _mm_srli_pi16(mm_u16_1, 1);
+				mm_u16_2 = _mm_srli_pi16(mm_u16_2, 1);
+				/* put back into 8-bit values */
+				mm_u = _mm_packs_pu16(mm_u16_1, mm_u16_2);
+#endif
+			}
+
+			__m64 mm_y1 = *(__m64 *)s1;
+			__m64 mm_y2 = *(__m64 *)&s1[8];
+
+			__m64 mm_uv1 = _mm_unpacklo_pi8(mm_u, mm_v);
+			__m64 mm_uv2 = _mm_unpackhi_pi8(mm_u, mm_v);
+
+			*(__m64 *)&dst[0] = _mm_unpacklo_pi8(mm_y1, mm_uv1);
+			*(__m64 *)&dst[2] = _mm_unpackhi_pi8(mm_y1, mm_uv1);
+			*(__m64 *)&dst[4] = _mm_unpacklo_pi8(mm_y2, mm_uv2);
+			*(__m64 *)&dst[6] = _mm_unpackhi_pi8(mm_y2, mm_uv2);
+
+			dst += 8; s2 += 8; s3 += 8; s1 += 16;
+#else /* __MMX__ */
 #if X_BYTE_ORDER == X_BIG_ENDIAN
-			dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
-			dst[1] = (s1[2] << 24) | (s1[3] << 8) | (sv(1) << 16) | su(1);
-			dst[2] = (s1[4] << 24) | (s1[5] << 8) | (sv(2) << 16) | su(2);
-			dst[3] = (s1[6] << 24) | (s1[7] << 8) | (sv(3) << 16) | su(3);
+			dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
+			dst[1] = (s1[2] << 24) | (s1[3] << 8) | (sv(1) << 16) | su(1);
+			dst[2] = (s1[4] << 24) | (s1[5] << 8) | (sv(2) << 16) | su(2);
+			dst[3] = (s1[6] << 24) | (s1[7] << 8) | (sv(3) << 16) | su(3);
 #else
-			dst[0] = s1[0] | (s1[1] << 16) | (sv(0) << 8) | (su(0) << 24);
-			dst[1] = s1[2] | (s1[3] << 16) | (sv(1) << 8) | (su(1) << 24);
-			dst[2] = s1[4] | (s1[5] << 16) | (sv(2) << 8) | (su(2) << 24);
-			dst[3] = s1[6] | (s1[7] << 16) | (sv(3) << 8) | (su(3) << 24);
+			dst[0] = s1[0] | (s1[1] << 16) | (sv(0) << 8) | (su(0) << 24);
+			dst[1] = s1[2] | (s1[3] << 16) | (sv(1) << 8) | (su(1) << 24);
+			dst[2] = s1[4] | (s1[5] << 16) | (sv(2) << 8) | (su(2) << 24);
+			dst[3] = s1[6] | (s1[7] << 16) | (sv(3) << 8) | (su(3) << 24);
 #endif
-			dst += 4; s2 += 4; s3 += 4; s1 += 8;
-			i -= 4;
+			dst += 4; s2 += 4; s3 += 4; s1 += 8;
+#endif /* __MMX__ */
 		}
 
-		while (i--) {
+		for (i = 0; i < e; i++) {
 #if X_BYTE_ORDER == X_BIG_ENDIAN
 			dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
 #else
-- 
2.26.2
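
For anyone following along, the 8-pixel inner step is just a two-level
byte interleave: the two chroma rows are unpacked into an interleaved
chroma vector, and that is unpacked against the Y row, which produces
the same byte order as the little-endian scalar stores above. One way
to sanity-check that ordering against the scalar path is a standalone
snippet along these lines (illustrative only, not part of the patch:
pack_scalar/pack_mmx are made-up names, it covers a single even row,
so no vertical chroma averaging and no width tail, and which chroma
plane is called U vs V is arbitrary here):

/* yuyv_pack_check.c -- sanity check for the 8-pixel MMX repacking step.
 * Build with e.g.:  gcc -O2 -mmmx yuyv_pack_check.c -o yuyv_pack_check */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <mmintrin.h>

/* Scalar reference: 16 luma + 8/8 chroma bytes -> 32 packed bytes,
 * laid out Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... like the little-endian stores. */
static void pack_scalar(const uint8_t *y, const uint8_t *u,
			const uint8_t *v, uint8_t *out)
{
	for (int i = 0; i < 8; i++) {
		out[4 * i + 0] = y[2 * i + 0];
		out[4 * i + 1] = u[i];
		out[4 * i + 2] = y[2 * i + 1];
		out[4 * i + 3] = v[i];
	}
}

/* MMX version: the same two-level unpack/interleave the patch performs. */
static void pack_mmx(const uint8_t *y, const uint8_t *u,
		     const uint8_t *v, uint8_t *out)
{
	__m64 mm_u = *(const __m64 *)u;
	__m64 mm_v = *(const __m64 *)v;
	__m64 mm_y1 = *(const __m64 *)&y[0];
	__m64 mm_y2 = *(const __m64 *)&y[8];

	/* u0 v0 u1 v1 ... / u4 v4 u5 v5 ... */
	__m64 mm_uv1 = _mm_unpacklo_pi8(mm_u, mm_v);
	__m64 mm_uv2 = _mm_unpackhi_pi8(mm_u, mm_v);

	/* y0 u0 y1 v0 y2 u1 y3 v1 ... */
	((__m64 *)out)[0] = _mm_unpacklo_pi8(mm_y1, mm_uv1);
	((__m64 *)out)[1] = _mm_unpackhi_pi8(mm_y1, mm_uv1);
	((__m64 *)out)[2] = _mm_unpacklo_pi8(mm_y2, mm_uv2);
	((__m64 *)out)[3] = _mm_unpackhi_pi8(mm_y2, mm_uv2);

	_mm_empty();	/* clear MMX state before returning to FPU code */
}

int main(void)
{
	uint8_t y[16] __attribute__((aligned(8)));
	uint8_t u[8] __attribute__((aligned(8)));
	uint8_t v[8] __attribute__((aligned(8)));
	uint8_t ref[32] __attribute__((aligned(8)));
	uint8_t got[32] __attribute__((aligned(8)));

	for (int i = 0; i < 16; i++)
		y[i] = 0x10 + i;
	for (int i = 0; i < 8; i++) {
		u[i] = 0x80 + i;
		v[i] = 0xc0 + i;
	}

	pack_scalar(y, u, v, ref);
	pack_mmx(y, u, v, got);

	printf(memcmp(ref, got, sizeof(ref)) ? "MISMATCH\n" : "ok\n");
	return 0;
}

Extending this to cover the odd-row chroma averaging and the scalar
tail loop would be the natural next step for the edge-case testing
mentioned above.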