Lovely. I actually get up to about a 4x speed improvement
on some encodes.
Here's a patch that is ported to SVN HEAD.
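For anyone reviewing: the heart of the change is a DspFunctions table of
function pointers. dsp_recon_init() (and the other dsp_*_init routines) fill
it with the plain C implementations, and when cpuid reports MMX support
(cpu_flags & CPU_X86_MMX) the lib/i386 init functions overwrite the entries
they accelerate. Below is a rough, self-contained sketch of that dispatch
idea; the struct layout and the stub bodies are illustrative only, not the
actual definitions from dsp.h in the patch.

    #include <string.h>
    #include <ogg/ogg.h>              /* ogg_uint32_t */

    #define CPU_X86_MMX (1<<0)

    typedef struct {                  /* trimmed-down stand-in for DspFunctions */
      void (*copy8x8) (unsigned char *src, unsigned char *dest,
                       unsigned int stride);
    } DspFunctions;

    static ogg_uint32_t cpu_flags;    /* set from cpuid by cpu_init() */

    /* portable reference implementation (the patch copies 32-bit words;
       memcpy is just a stand-in here) */
    static void copy8x8__c (unsigned char *src, unsigned char *dest,
                            unsigned int stride)
    {
      int j;
      for (j = 0; j < 8; j++) {
        memcpy (dest, src, 8);        /* copy one 8-pixel row */
        src  += stride;
        dest += stride;
      }
    }

    /* stand-in for the real MMX routine in lib/i386/recon_mmx.c */
    static void copy8x8__mmx (unsigned char *src, unsigned char *dest,
                              unsigned int stride)
    {
      copy8x8__c (src, dest, stride);
    }

    static void dsp_i386_mmx_recon_init (DspFunctions *funcs)
    {
      funcs->copy8x8 = copy8x8__mmx;  /* swap in the accelerated version */
    }

    static void dsp_recon_init (DspFunctions *funcs)
    {
      funcs->copy8x8 = copy8x8__c;    /* portable default */
      if (cpu_flags & CPU_X86_MMX)
        dsp_i386_mmx_recon_init (funcs);
    }

The same pattern carries over to the fdct, sub8x8 and SAD routines, and
dsp_i386_mmxext_init() overwrites the SAD entries again when the CPU also
reports MMXEXT.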
--Adam
-------------- next part --------------
Index: lib/reconstruct.c
===================================================================
--- lib/reconstruct.c (revision 7621)
+++ lib/reconstruct.c (working copy)
@@ -16,12 +16,28 @@
********************************************************************/
#include "encoder_internal.h"
+#include "dsp.h"
+#include "cpu.h"
-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void copy8x8__c (unsigned char *src,
+ unsigned char *dest,
+ unsigned int stride)
+{
+ int j;
+ for ( j = 0; j < 8; j++ ){
+ ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
+ ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
+ src+=stride;
+ dest+=stride;
+ }
+}
+
+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+ for (i = 8; i; i--){
/* Convert the data back to 8 bit unsigned */
/* Saturate the output to unsigend 8 bit values */
ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
@@ -34,17 +50,16 @@
ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
ReconPtr += LineStep;
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
}
-
}
-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr, ogg_int16_t * ChangePtr,
- ogg_uint32_t LineStep ) {
+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
+ for (i = 8; i; i--){
ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
@@ -54,19 +69,19 @@
ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
ReconPtr += LineStep;
RefPtr += LineStep;
}
-
}
-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr1, unsigned char * RefPtr2,
- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+ for (i = 8; i; i--){
ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) +
ChangePtr[0] );
ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) +
ChangePtr[1] );
ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) +
ChangePtr[2] );
@@ -76,10 +91,20 @@
ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) +
ChangePtr[6] );
ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) +
ChangePtr[7] );
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
ReconPtr += LineStep;
RefPtr1 += LineStep;
RefPtr2 += LineStep;
}
+}
+void dsp_recon_init (DspFunctions *funcs)
+{
+ funcs->copy8x8 = copy8x8__c;
+ funcs->recon_intra8x8 = recon_intra8x8__c;
+ funcs->recon_inter8x8 = recon_inter8x8__c;
+ funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+ if (cpu_flags & CPU_X86_MMX) {
+ dsp_i386_mmx_recon_init(&dsp_funcs);
+ }
}
Index: lib/dct_encode.c
===================================================================
--- lib/dct_encode.c (revision 7621)
+++ lib/dct_encode.c (working copy)
@@ -17,110 +17,10 @@
#include <stdlib.h>
#include "encoder_internal.h"
+#include "dsp.h"
static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
- unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine ) {
- int i;
-
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
-
- /* Update the screen canvas in one step*/
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- ReconPtr += ReconPixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- unsigned char *old_ptr1, unsigned char *new_ptr1,
- ogg_uint32_t PixelsPerLine ) {
- int i;
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
- /* INTRA mode so code raw image data */
- /* We convert the data to 8 bit signed (by subtracting 128) as
- this reduces the internal precision requirments in the DCT
- transform. */
- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
-
- /* Update the screen canvas in one step */
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- unsigned char *old_ptr1, unsigned char *new_ptr1,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine ) {
- int i;
-
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DctInputPtr[0] = (ogg_int16_t)
- ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
- DctInputPtr[1] = (ogg_int16_t)
- ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
- DctInputPtr[2] = (ogg_int16_t)
- ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
- DctInputPtr[3] = (ogg_int16_t)
- ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
- DctInputPtr[4] = (ogg_int16_t)
- ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
- DctInputPtr[5] = (ogg_int16_t)
- ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
- DctInputPtr[6] = (ogg_int16_t)
- ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
- DctInputPtr[7] = (ogg_int16_t)
- ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
-
- /* Update the screen canvas in one step */
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- ReconPtr1 += ReconPixelsPerLine;
- ReconPtr2 += ReconPixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
ogg_uint32_t * TokenListPtr ){
unsigned char tokens_added = 0;
@@ -452,13 +352,15 @@
/* Is the MV offset exactly pixel alligned */
if ( AbsRefOffset == 0 ){
- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
- PixelsPerLine, ReconPixelsPerLine );
+ dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
} else {
/* Fractional pixel MVs. */
/* Note that we only use two pixel values even for the diagonal */
- Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
- new_ptr1, PixelsPerLine, ReconPixelsPerLine );
+ dsp_static_sub8x8avg2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
}
}
@@ -534,17 +436,18 @@
pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
}
- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
- PixelsPerLine, ReconPixelsPerLine );
+ dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
} else if ( cpi->pb.CodingMode==CODE_INTRA ) {
- Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
-
+ dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine);
+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
}
/* Proceed to encode the data into the encode buffer if the encoder
is enabled. */
/* Perform a 2D DCT transform on the data. */
- fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
+ dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
/* Quantize that transform data. */
quantize ( &cpi->pb, cpi->DCT_codes,
cpi->pb.QFragData[FragIndex] );
Index: lib/cpu.c
===================================================================
--- lib/cpu.c (revision 0)
+++ lib/cpu.c (revision 0)
@@ -0,0 +1,107 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+ogg_uint32_t cpu_flags = 0;
+
+#if 1
+static ogg_uint32_t cpu_get_flags (void)
+{
+ ogg_uint32_t eax, ebx, ecx, edx;
+ ogg_uint32_t flags;
+
+#define cpuid(op,eax,ebx,ecx,edx) \
+ asm volatile ("pushl %%ebx \n\t" \
+ "cpuid \n\t" \
+ "movl %%ebx,%1 \n\t" \
+ "popl %%ebx" \
+ : "=a" (eax), \
+ "=r" (ebx), \
+ "=c" (ecx), \
+ "=d" (edx) \
+ : "a" (op) \
+ : "cc")
+
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0,%1 \n\t"
+ "xorl $0x200000,%0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl"
+ : "=r" (eax),
+ "=r" (ebx)
+ :
+ : "cc");
+
+ if (eax == ebx) /* no cpuid */
+ return 0;
+
+ cpuid(0, eax, ebx, ecx, edx);
+
+ if (ebx == 0x756e6547 &&
+ edx == 0x49656e69 &&
+ ecx == 0x6c65746e) {
+ /* intel */
+
+ inteltest:
+ cpuid(1, eax, ebx, ecx, edx);
+ if ((edx & 0x00800000) == 0)
+ return 0;
+ flags = CPU_X86_MMX;
+ if (edx & 0x02000000)
+ flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
+ if (edx & 0x04000000)
+ flags |= CPU_X86_SSE2;
+ return flags;
+ } else if (ebx == 0x68747541 &&
+ edx == 0x69746e65 &&
+ ecx == 0x444d4163) {
+ /* AMD */
+ cpuid(0x80000000, eax, ebx, ecx, edx);
+ if ((unsigned)eax < 0x80000001)
+ goto inteltest;
+ cpuid(0x80000001, eax, ebx, ecx, edx);
+ if ((edx & 0x00800000) == 0)
+ return 0;
+ flags = CPU_X86_MMX;
+ if (edx & 0x80000000)
+ flags |= CPU_X86_3DNOW;
+ if (edx & 0x00400000)
+ flags |= CPU_X86_MMXEXT;
+ return flags;
+ }
+ else {
+ /* implement me */
+ }
+
+ return flags;
+}
+#else
+static ogg_uint32_t cpu_get_flags (void) {
+ return 0;
+}
+#endif
+
+void cpu_init ()
+{
+ cpu_flags = cpu_get_flags();
+}
Index: lib/cpu.h
===================================================================
--- lib/cpu.h (revision 0)
+++ lib/cpu.h (revision 0)
@@ -0,0 +1,28 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "encoder_internal.h"
+
+extern ogg_uint32_t cpu_flags;
+
+#define CPU_X86_MMX (1<<0)
+#define CPU_X86_3DNOW (1<<1)
+#define CPU_X86_MMXEXT (1<<2)
+#define CPU_X86_SSE (1<<3)
+#define CPU_X86_SSE2 (1<<4)
+
+void cpu_init () ;
Index: lib/i386/fdct_mmx.c
===================================================================
--- lib/i386/fdct_mmx.c (revision 0)
+++ lib/i386/fdct_mmx.c (revision 0)
@@ -0,0 +1,340 @@
+;//==========================================================================
+;//
+;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
+;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
+;// PURPOSE.
+;//
+;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
+;//
+;//--------------------------------------------------------------------------
+
+#include <theora/theora.h>
+#include "dsp.h"
+
+static const __attribute__ ((aligned(8))) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
+static const __attribute__ ((aligned(8))) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
+
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+/***********************************************************************
+ * File: fdct_m.asm
+ *
+ * Description:
+ * This function performs a 2-D Forward DCT on an 8x8 block
+ *
+ *
+ * Input: Pointers to input source data buffer and destination
+ * buffer.
+ *
+ * Note: none
+ *
+ * Special Notes: We try to do the truncation right to match the result
+ * of the c version.
+ *
+ ************************************************************************/
+
+/* execute stage 1 of forward DCT */
+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
+ " movq " #ip0 ", %%mm0 \n\t"
\
+ " movq " #ip1 ", %%mm1 \n\t"
\
+ " movq " #ip3 ", %%mm2 \n\t"
\
+ " movq " #ip5 ", %%mm3 \n\t"
\
+ " movq %%mm0, %%mm4 \n\t"
\
+ " movq %%mm1, %%mm5 \n\t"
\
+ " movq %%mm2, %%mm6 \n\t"
\
+ " movq %%mm3, %%mm7 \n\t"
\
+ \
+ " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 +
ip7 = is07 */ \
+ " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 +
ip2 = is12 */ \
+ " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 +
ip4 = is34 */ \
+ " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 +
ip6 = is56 */ \
+ " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 -
ip7 = id07 */ \
+ " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 -
ip2 = id12 */ \
+ \
+ " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */
\
+ \
+ " paddsw %%mm2, %%mm2 \n\t"
\
+ \
+ " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 -
ip4 = id34 */ \
+ \
+ " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 =
is0734 */ \
+ " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */
\
+ " movq %%mm0," #temp " \n\t" /* Save is07 -
is34 to free mm0; */ \
+ " paddsw %%mm3, %%mm3 \n\t"
\
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 =
is1256 */ \
+ \
+ " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 -
ip6 = id56 */ \
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */
\
+ " paddsw %%mm7, %%mm7 \n\t"
\
+ " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */
\
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */
\
+ " paddsw %%mm3, %%mm3 \n\t"
\
+ \
+ " movq %%mm2, %%mm0 \n\t" /* make a copy */
\
+ " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */
\
+ \
+ " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 *
( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
+ " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 -
is1256 ) */ \
+ " psrlw $15, %%mm2 \n\t"
\
+ " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is
op[4] */ \
+ \
+ " movq %%mm3, %%mm2 \n\t"
\
+ " movq %%mm0," #ip4 " \n\t" /* save ip4, now
mm0,mm2 are free */ \
+ \
+ " movq %%mm3, %%mm0 \n\t"
\
+ " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 *
( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
+ \
+ " psrlw $15, %%mm2 \n\t"
\
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734
+is1256 ) */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is
op[0] */ \
+ \
+ " movq %%mm3," #ip0 " \n\t"
\
+ /* ------------------------------------------------------------------- */ \
+ " movq " #temp ", %%mm3 \n\t" /* mm3 =
irot_input_y */ \
+ " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 *
irot_input_y - irot_input_y */ \
+ \
+ " movq " #temp ", %%mm2 \n\t"
\
+ " movq %%mm2, %%mm0 \n\t"
\
+ \
+ " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 *
irot_input_y */ \
+ " paddw %%mm0, %%mm3 \n\t"
\
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */
\
+ " movq %%mm5, %%mm0 \n\t"
\
+ \
+ " movq %%mm5, %%mm2 \n\t"
\
+ " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 *
irot_input_x */ \
+ \
+ " psrlw $15, %%mm2 \n\t"
\
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */
\
+ \
+ " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */
\
+ " movq %%mm3," #ip2 " \n\t" /* Save ip2 */
\
+ \
+ " movq %%mm5, %%mm0 \n\t"
\
+ " movq %%mm5, %%mm2 \n\t"
\
+ \
+ " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 *
irot_input_x - irot_input_x */ \
+ " psrlw $15, %%mm2 \n\t"
\
+ \
+ " movq " #temp ", %%mm3 \n\t"
\
+ " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 *
irot_input_x */ \
+ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */
\
+ " movq %%mm3, %%mm2 \n\t"
\
+ \
+ " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 *
irot_input_y */ \
+ " psrlw $15, %%mm2 \n\t"
\
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */
\
+ " psubsw %%mm5, %%mm3 \n\t"
\
+ \
+ " movq %%mm3," #ip6 " \n\t"
\
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC4S4)", %%mm0 \n\t"
\
+ " movq %%mm1, %%mm2 \n\t"
\
+ " movq %%mm1, %%mm3 \n\t"
\
+ \
+ " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 -
is56 ) - ( is12 - is56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 -
is56 ) */ \
+ " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is
icommon_product1 */ \
+ \
+ " movq %%mm7, %%mm2 \n\t"
\
+ " movq %%mm7, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 +
id56 ) - ( id12 + id56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 +
id56 ) */ \
+ " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is
icommon_product2 */ \
+ /* ------------------------------------------------------------------- */ \
+ " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */
\
+ " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */
\
+ \
+ " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 +
idcommon_product2 ) */ \
+ " paddsw %%mm6, %%mm6 \n\t"
\
+ " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 -
icommon_product2 */ \
+ \
+ " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 -
icommon_product1 */ \
+ " paddsw %%mm1, %%mm1 \n\t"
\
+ " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 +
icommon_product1 */ \
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC1S7)", %%mm7 \n\t"
\
+ " movq %%mm1, %%mm2 \n\t"
\
+ \
+ " movq %%mm1, %%mm3 \n\t"
\
+ " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 *
irot_input_x - irot_input_x */ \
+ \
+ " movq "M(xC7S1)", %%mm7 \n\t"
\
+ " psrlw $15, %%mm2 \n\t"
\
+ \
+ " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 *
irot_input_x */ \
+ " paddw %%mm2, %%mm1 \n\t" /* Trucated */
\
+ \
+ " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 *
irot_input_x */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */
\
+ \
+ " movq %%mm0, %%mm5 \n\t"
\
+ " movq %%mm0, %%mm2 \n\t"
\
+ \
+ " movq "M(xC1S7)", %%mm7 \n\t"
\
+ " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 *
irot_input_y - irot_input_y */ \
+ \
+ " movq "M(xC7S1)", %%mm7 \n\t"
\
+ " psrlw $15, %%mm2 \n\t"
\
+ \
+ " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 *
irot_input_y */ \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */
\
+ \
+ " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 *
irot_input_y */ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */
\
+ \
+ " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 *
irot_input_x - xC7S1 * irot_input_y = ip1 */ \
+ " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 *
irot_input_x - xC1S7 * irot_input_y = ip7 */ \
+ \
+ " movq %%mm1," #ip1 " \n\t"
\
+ " movq %%mm3," #ip7 " \n\t"
\
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC3S5)", %%mm0 \n\t"
\
+ " movq "M(xC5S3)", %%mm1 \n\t"
\
+ \
+ " movq %%mm6, %%mm5 \n\t"
\
+ " movq %%mm6, %%mm7 \n\t"
\
+ \
+ " movq %%mm4, %%mm2 \n\t"
\
+ " movq %%mm4, %%mm3 \n\t"
\
+ \
+ " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 *
irot_input_x - irot_input_x */ \
+ " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 *
irot_input_y - irot_input_y */ \
+ \
+ " psrlw $15, %%mm2 \n\t"
\
+ " psrlw $15, %%mm5 \n\t"
\
+ \
+ " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 *
irot_input_x */ \
+ " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 *
irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t" /* Truncated */
\
+ " paddw %%mm5, %%mm6 \n\t" /* Truncated */
\
+ \
+ " psubsw %%mm6, %%mm4 \n\t" /* ip3 */
\
+ " movq %%mm4," #ip3 " \n\t"
\
+ \
+ " movq %%mm3, %%mm4 \n\t"
\
+ " movq %%mm7, %%mm6 \n\t"
\
+ \
+ " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 *
irot_input_x - irot_input_x */ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 *
irot_input_y - irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t"
\
+ " paddw %%mm5, %%mm6 \n\t"
\
+ \
+ " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 *
irot_input_x */ \
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 *
irot_input_y */ \
+ \
+ " paddw %%mm7, %%mm3 \n\t" /* ip5 */
\
+ " movq %%mm3," #ip5 " \n\t"
+
+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
+ " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2
a3 */ \
+ " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6
e7 */ \
+ " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2
b3 */ \
+ " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6
f7 */ \
+ " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2
c3 */ \
+ " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6
g7 */ \
+ " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2
d3 */ \
+ " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2
b3 */ \
+ " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2
h3 */ \
+ /* Transpose 2x8 block */ \
+ " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */
\
+ " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */
\
+ " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1
a0 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */
\
+ " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */
\
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */
\
+ " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */
\
+ " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4
*/ \
+ " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5
*/ \
+ " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */
\
+ " movq %%mm4," #op4 " \n\t"
\
+ " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */
\
+ " movq %%mm5," #op5 " \n\t"
\
+ " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7
*/ \
+ " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1
a0 */ \
+ " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6
*/ \
+ " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1
b0 */ \
+ " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */
\
+ " movq %%mm6," #op7 " \n\t"
\
+ " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */
\
+ " movq %%mm1," #op6 " \n\t"
\
+ " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */
\
+ " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */
\
+ " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */
\
+ " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */
\
+ " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0
*/ \
+ " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1
*/ \
+ " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */
\
+ " movq %%mm0," #op0 " \n\t"
\
+ " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */
\
+ " movq %%mm1," #op1 " \n\t"
\
+ " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3
*/ \
+ " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2
*/ \
+ " movq %%mm4," #op3 " \n\t"
\
+ " movq %%mm2," #op2 " \n\t"
+
+
+static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
+{
+ ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
+ ogg_int16_t *const temp= (int16_t*)align_tmp;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ /*
+ * Input data is an 8x8 block. To make processing of the data more efficient
+ * we will transpose the block of data to two 4x8 blocks???
+ */
+ Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
+
+ Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
+
+ Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ " emms \n\t"
+
+ : "+r" (InputData),
+ "+r" (OutputData)
+ : "r" (temp)
+ : "memory"
+ );
+}
+
+void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
+{
+ funcs->fdct_short = fdct_short__mmx;
+}
Index: lib/i386/dsp_mmx.c
===================================================================
--- lib/i386/dsp_mmx.c (revision 0)
+++ lib/i386/dsp_mmx.c (revision 0)
@@ -0,0 +1,642 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "dsp.h"
+
+static const __attribute__ ((aligned(8))) ogg_int64_t V128 = 0x0080008000800080LL;
+
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+
+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up
conversion */
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up
conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr
*/
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr
*/
+ " movq %%mm0, (%2) \n\t" /* write answer out */
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %2 \n\t"
+ " add %3, %0 \n\t"
+ " add %4, %1 \n\t"
+ ".endr \n\t"
+
+ : "+r" (FiltPtr),
+ "+r" (ReconPtr),
+ "+r" (DctInputPtr)
+ : "m" (PixelsPerLine),
+ "m" (ReconPixelsPerLine)
+ : "memory"
+ );
+}
+
+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+ " movq "M(V128)", %%mm1 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up
conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
+ " movq %%mm0, (%1) \n\t" /* write answer out */
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %1 \n\t"
+ " add %2, %0 \n\t"
+ ".endr \n\t"
+
+ : "+r" (FiltPtr),
+ "+r" (DctInputPtr)
+ : "m" (PixelsPerLine)
+ : "memory"
+ );
+}
+
+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
+ " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up
conversion */
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up
conversion */
+ " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up
conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1)
*/
+ " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2)
*/
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1)
*/
+ " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2)
*/
+ /* average ReconPtr1 and ReconPtr2 */
+ " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 +
ReconPtr2 */
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 +
ReconPtr2 */
+ " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 +
ReconPtr2) / 2 */
+ " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 +
ReconPtr2) / 2 */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr -
((ReconPtr1 + ReconPtr2) / 2) */
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr -
((ReconPtr1 + ReconPtr2) / 2) */
+ " movq %%mm0, (%3) \n\t" /* write answer out */
+ " movq %%mm2, 8(%3) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %3 \n\t"
+ " add %4, %0 \n\t"
+ " add %5, %1 \n\t"
+ " add %5, %2 \n\t"
+ ".endr \n\t"
+
+ : "+r" (FiltPtr),
+ "+r" (ReconPtr1),
+ "+r" (ReconPtr2),
+ "+r" (DctInputPtr)
+ : "m" (PixelsPerLine),
+ "m" (ReconPixelsPerLine)
+ : "memory"
+ );
+}
+
+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+{
+ ogg_uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack
*/
+ " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack
*/
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four
bytes to higher precision */
+ " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four
bytes to higher precision */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+ " psrlq $32, %%mm2 \n\t" /* fold and add */
+ " psrlq $32, %%mm3 \n\t"
+ " paddw %%mm2, %%mm0 \n\t"
+ " paddw %%mm3, %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+ " psrlq $16, %%mm2 \n\t"
+ " psrlq $16, %%mm3 \n\t"
+ " paddw %%mm2, %%mm0 \n\t"
+ " paddw %%mm3, %%mm1 \n\t"
+
+ " psubusw %%mm0, %%mm1 \n\t"
+ " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
+ " movd %%mm1, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=m" (MaxSad),
+ "+r" (Src1),
+ "+r" (Src2)
+ :
+ : "memory"
+ );
+ return MaxSad;
+}
+
+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
+ ogg_uint32_t stride)
+{
+ ogg_uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack
*/
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference...
*/
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " add %3, %2 \n\t" /* Inc pointer into the new
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "2: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " add %3, %2 \n\t" /* Inc pointer into the new
data */
+
+ " dec %%edi \n\t"
+ " jnz 2b \n\t"
+
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
+ " psubusw %%mm4, %%mm5 \n\t"
+ " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
+ " psubusw %%mm5, %%mm7 \n\t"
+ " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $32, %%mm6 \n\t"
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $16, %%mm6 \n\t"
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=r" (MaxSad),
+ "+r" (Src1),
+ "+r" (Src2)
+ : "r" (stride)
+ : "memory", "edi"
+ );
+
+ return MaxSad;
+}
+
+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack
*/
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result
*/
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %4, %2 \n\t" /* Inc pointer into ref
data */
+ ".endr \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $16, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=m" (DiffVal),
+ "+r" (ptr1),
+ "+r" (ptr2)
+ : "r" (stride1),
+ "r" (stride2)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
+{
+ return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5
*/
+ " paddb %%mm5, %%mm5 \n\t"
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack
*/
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result
*/
+ " mov $8, %%edi \n\t" /* 8 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%2), %%mm2 \n\t"
+ " movq (%3), %%mm3 \n\t" /* take average of mm2 and
mm3 */
+ " movq %%mm2, %%mm1 \n\t"
+ " pand %%mm3, %%mm1 \n\t"
+ " pxor %%mm2, %%mm3 \n\t"
+ " pand %%mm5, %%mm3 \n\t"
+ " psrlq $1, %%mm3 \n\t"
+ " paddb %%mm3, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " add %4, %1 \n\t" /* Inc pointer into the new
data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %5, %2 \n\t" /* Inc pointer into ref
data */
+ " add %5, %3 \n\t" /* Inc pointer into ref
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $16, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=m" (DiffVal),
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
+ : "m" (SrcStride),
+ "m" (RefStride)
+ : "edi", "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq %%mm0, %%mm2 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %3, %2 \n\t" /* Inc pointer into src
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+
+ : "=r" (XSum),
+ "=r" (XXSum),
+ "+r" (DataPtr)
+ : "r" (Stride)
+ : "edi", "memory"
+ );
+
+ /* Compute population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ) );
+}
+
+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%3), %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm6, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm6, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %4, %2 \n\t" /* Inc pointer into src
data */
+ " add %5, %3 \n\t" /* Inc pointer into ref
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+
+ : "=m" (XSum),
+ "=m" (XXSum),
+ "+r" (SrcData),
+ "+r" (RefDataPtr)
+ : "m" (SrcStride),
+ "m" (RefStride)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+}
+
+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4
*/
+ " paddb %%mm4, %%mm4 \n\t"
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%3), %%mm2 \n\t"
+ " movq (%4), %%mm3 \n\t" /* take average of mm2 and
mm3 */
+ " movq %%mm2, %%mm1 \n\t"
+ " pand %%mm3, %%mm1 \n\t"
+ " pxor %%mm2, %%mm3 \n\t"
+ " pand %%mm4, %%mm3 \n\t"
+ " psrlq $1, %%mm3 \n\t"
+ " paddb %%mm3, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm6, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm6, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %5, %2 \n\t" /* Inc pointer into src
data */
+ " add %6, %3 \n\t" /* Inc pointer into ref
data */
+ " add %6, %4 \n\t" /* Inc pointer into ref
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+
+ : "=m" (XSum),
+ "=m" (XXSum),
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
+ : "m" (SrcStride),
+ "m" (RefStride)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+}
+
+static void restore_fpu (void)
+{
+ __asm__ __volatile__ (
+ " emms \n\t"
+ );
+}
+
+void dsp_i386_mmx_init(DspFunctions *funcs)
+{
+ funcs->restore_fpu = restore_fpu;
+ funcs->sub8x8 = sub8x8__mmx;
+ funcs->sub8x8_128 = sub8x8_128__mmx;
+ funcs->sub8x8avg2 = sub8x8avg2__mmx;
+ funcs->row_sad8 = row_sad8__mmx;
+ funcs->col_sad8x8 = col_sad8x8__mmx;
+ funcs->sad8x8 = sad8x8__mmx;
+ funcs->sad8x8_thres = sad8x8_thres__mmx;
+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
+ funcs->intra8x8_err = intra8x8_err__mmx;
+ funcs->inter8x8_err = inter8x8_err__mmx;
+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
+}
+
Index: lib/i386/recon_mmx.c
===================================================================
--- lib/i386/recon_mmx.c (revision 0)
+++ lib/i386/recon_mmx.c (revision 0)
@@ -0,0 +1,185 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "encoder_internal.h"
+
+static const __attribute__ ((aligned(8))) ogg_int64_t V128 = 0x8080808080808080LL;
+
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+static void copy8x8__mmx (unsigned char *src,
+ unsigned char *dest,
+ unsigned int stride)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " lea (%2, %2, 2), %%edi \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%edi), %%mm3 \n\t"
+
+ " lea (%1, %2, 4), %1 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%edi) \n\t"
+
+ " lea (%0, %2, 4), %0 \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%edi), %%mm3 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%edi) \n\t"
+ : "+a" (dest)
+ : "c" (src),
+ "d" (stride)
+ : "memory", "edi"
+ );
+}
+
+static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to
0x8080808080808080 */
+
+ " lea 128(%1), %%edi \n\t" /* Endpoint in input
buffer */
+ "1: \n\t"
+ " movq (%1), %%mm2 \n\t" /* First four input
values */
+
+ " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high)
four values */
+ " por %%mm0, %%mm0 \n\t"
+ " pxor %%mm0, %%mm2 \n\t" /* Convert result to
unsigned (same as add 128) */
+ " lea 16(%1), %1 \n\t" /* Step source buffer */
+ " cmp %%edi, %1 \n\t" /* are we done */
+
+ " movq %%mm2, (%0) \n\t" /* store results */
+
+ " lea (%0, %2), %0 \n\t" /* Step output buffer */
+ " jc 1b \n\t" /* Loop back if we are
not done */
+ : "+r" (ReconPtr)
+ : "r" (ChangePtr),
+ "r" (LineStep)
+ : "memory", "edi"
+ );
+}
+
+static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm0, %%mm0 \n\t"
+ " lea 128(%1), %%edi \n\t"
+
+ "1: \n\t"
+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8
reference pixels */
+
+ " movq (%1), %%mm4 \n\t" /* first 4 changes */
+ " movq %%mm2, %%mm3 \n\t"
+ " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
+ " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs
into positive 16-bit #s */
+ " paddsw %%mm4, %%mm2 \n\t" /* add in first 4
changes */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into
positive 16-bit #s */
+ " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes
*/
+ " add %3, %2 \n\t" /* next row of reference
pixels */
+ " packuswb %%mm3, %%mm2 \n\t" /* pack result to
unsigned 8-bit values */
+ " lea 16(%1), %1 \n\t" /* next row of changes
*/
+ " cmp %%edi, %1 \n\t" /* are we done? */
+
+ " movq %%mm2, (%0) \n\t" /* store result */
+
+ " lea (%0, %3), %0 \n\t" /* next row of output */
+ " jc 1b \n\t"
+ : "+r" (ReconPtr)
+ : "r" (ChangePtr),
+ "r" (RefPtr),
+ "r" (LineStep)
+ : "memory", "edi"
+ );
+}
+
+static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm0, %%mm0 \n\t"
+ " lea 128(%1), %%edi \n\t"
+
+ "1: \n\t"
+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8
reference pixels */
+ " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8
reference pixels */
+
+ " movq %%mm2, %%mm3 \n\t"
+ " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as
positive 16-bit #s */
+ " movq %%mm4, %%mm5 \n\t"
+ " movq (%1), %%mm6 \n\t" /* first 4 changes */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as
positive 16-bit #s */
+ " movq 8(%1), %%mm7 \n\t" /* last 4 changes */
+ " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as
positive 16-bit #s */
+ " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as
positive 16-bit #s */
+ " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 +
ref2) */
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 +
ref2) */
+ " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 +
ref2)/2 */
+ " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 +
ref2)/2 */
+ " paddw %%mm6, %%mm2 \n\t" /* add changes to start
*/
+ " paddw %%mm7, %%mm3 \n\t" /* add changes to end */
+ " lea 16(%1), %1 \n\t" /* next row of changes
*/
+ " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to
unsigned 8-bit */
+ " add %4, %2 \n\t" /* next row of reference
pixels */
+ " add %4, %3 \n\t" /* next row of reference
pixels */
+ " movq %%mm2, (%0) \n\t" /* store result */
+ " add %4, %0 \n\t" /* next row of output */
+ " cmp %%edi, %1 \n\t" /* are we done? */
+ " jc 1b \n\t"
+ : "+r" (ReconPtr)
+ : "r" (ChangePtr),
+ "r" (RefPtr1),
+ "r" (RefPtr2),
+ "m" (LineStep)
+ : "memory", "edi"
+ );
+}
+
+void dsp_i386_mmx_recon_init(DspFunctions *funcs)
+{
+ funcs->copy8x8 = copy8x8__mmx;
+ funcs->recon_intra8x8 = recon_intra8x8__mmx;
+ funcs->recon_inter8x8 = recon_inter8x8__mmx;
+ funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
+}
+
Index: lib/i386/dsp_mmxext.c
===================================================================
--- lib/i386/dsp_mmxext.c (revision 0)
+++ lib/i386/dsp_mmxext.c (revision 0)
@@ -0,0 +1,316 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "dsp.h"
+
+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result
*/
+
+ ".rept 7 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %4, %2 \n\t" /* Inc pointer into ref
data */
+ ".endr \n\t"
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " movd %%mm7, %0 \n\t"
+
+ : "=r" (DiffVal),
+ "+r" (ptr1),
+ "+r" (ptr2)
+ : "r" (stride1),
+ "r" (stride2)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result
*/
+
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %4, %2 \n\t" /* Inc pointer into ref
data */
+ ".endr \n\t"
+
+ " movd %%mm7, %0 \n\t"
+
+ : "=r" (DiffVal),
+ "+r" (ptr1),
+ "+r" (ptr2)
+ : "r" (stride1),
+ "r" (stride2)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result
*/
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " movq (%3), %%mm2 \n\t"
+ " pavgb %%mm2, %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+
+ " add %4, %1 \n\t" /* Inc pointer into the new
data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %5, %2 \n\t" /* Inc pointer into ref
data */
+ " add %5, %3 \n\t" /* Inc pointer into ref
data */
+ ".endr \n\t"
+
+ " movd %%mm7, %0 \n\t"
+ : "=m" (DiffVal),
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
+ : "m" (SrcStride),
+ "m" (RefStride)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
+{
+ ogg_uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " movd (%1), %%mm0 \n\t"
+ " movd (%2), %%mm1 \n\t"
+ " psadbw %%mm0, %%mm1 \n\t"
+ " movd 4(%1), %%mm2 \n\t"
+ " movd 4(%2), %%mm3 \n\t"
+ " psadbw %%mm2, %%mm3 \n\t"
+
+ " pmaxsw %%mm1, %%mm3 \n\t"
+ " movd %%mm3, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=m" (MaxSad),
+ "+r" (Src1),
+ "+r" (Src2)
+ :
+ : "memory"
+ );
+
+ return MaxSad;
+}
+
+static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
+ ogg_uint32_t stride)
+{
+ ogg_uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack
*/
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference...
*/
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " add %3, %2 \n\t" /* Inc pointer into the new
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "2: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs
difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher
precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference...
*/
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes
to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference...
*/
+ " add %3, %1 \n\t" /* Inc pointer into the new
data */
+ " add %3, %2 \n\t" /* Inc pointer into the new
data */
+
+ " dec %%edi \n\t"
+ " jnz 2b \n\t"
+
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " pmaxsw %%mm4, %%mm5 \n\t"
+ " pmaxsw %%mm5, %%mm7 \n\t"
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $32, %%mm6 \n\t"
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $16, %%mm6 \n\t"
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+
+ : "=r" (MaxSad),
+ "+r" (Src1),
+ "+r" (Src2)
+ : "r" (stride)
+ : "memory", "edi"
+ );
+
+ return MaxSad;
+}
+
+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm4, %%mm4 \n\t"
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%3), %%mm2 \n\t"
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and
mm1 */
+ " pavgb %%mm2, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm4, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm4, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %5, %2 \n\t" /* Inc pointer into src
data */
+ " add %6, %3 \n\t" /* Inc pointer into ref
data */
+ " add %6, %4 \n\t" /* Inc pointer into ref
data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+
+ : "=m" (XSum),
+ "=m" (XXSum),
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
+ : "m" (SrcStride),
+ "m" (RefStride)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+}
+
+void dsp_i386_mmxext_init(DspFunctions *funcs)
+{
+ funcs->row_sad8 = row_sad8__mmxext;
+ funcs->col_sad8x8 = col_sad8x8__mmxext;
+ funcs->sad8x8 = sad8x8__mmxext;
+ funcs->sad8x8_thres = sad8x8_thres__mmxext;
+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
+}
+
Index: lib/dct.c
===================================================================
--- lib/dct.c (revision 7621)
+++ lib/dct.c (working copy)
@@ -16,6 +16,7 @@
********************************************************************/
#include "encoder_internal.h"
+#include "cpu.h"
static ogg_int32_t xC1S7 = 64277;
static ogg_int32_t xC2S6 = 60547;
@@ -28,7 +29,7 @@
#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
+static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
int loop;
ogg_int32_t is07, is12, is34, is56;
@@ -251,3 +252,12 @@
op ++;
}
}
+
+void dsp_dct_init (DspFunctions *funcs)
+{
+ funcs->fdct_short = fdct_short__c;
+ if (cpu_flags & CPU_X86_MMX) {
+ dsp_i386_mmx_fdct_init(&dsp_funcs);
+ }
+}
+
Index: lib/mcomp.c
===================================================================
--- lib/mcomp.c (revision 7621)
+++ lib/mcomp.c (working copy)
@@ -17,6 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
+#include "dsp.h"
#include "encoder_internal.h"
/* Initialises motion compentsation. */
@@ -100,164 +101,25 @@
unsigned char * RefDataPtr1,
unsigned char * RefDataPtr2,
ogg_uint32_t PixelsPerLine ) {
- ogg_uint32_t i;
- ogg_int32_t XSum=0;
- ogg_int32_t XXSum=0;
ogg_int32_t DiffVal;
- ogg_int32_t AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
+ ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
+ ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
/* Mode of interpolation chosen based upon on the offset of the
second reference pointer */
- if ( AbsRefOffset == 0 ) {
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
- XSum += DiffVal;
-
- /* negative array indexes are strictly forbidden by ANSI C and C99 */
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
- }
-
+ if ( RefOffset == 0 ) {
+ DiffVal = dsp_static_inter8x8_err (NewDataPtr, PixelsPerLine,
+ RefDataPtr1, RefPixelsPerLine);
}else{
-
- /* Simple two reference interpolation */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal = ((int)NewDataPtr[0]) -
- (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[1]) -
- (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[2]) -
- (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[3]) -
- (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[4]) -
- (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[5]) -
- (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[6]) -
- (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[7]) -
- (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
- RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
- }
+ DiffVal = dsp_static_inter8x8_err_xy2 (NewDataPtr, PixelsPerLine,
+ RefDataPtr1,
+ RefDataPtr2, RefPixelsPerLine);
}
/* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t GetSumAbsDiffs (unsigned char * NewDataPtr,
- unsigned char * RefDataPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ErrorSoFar) {
- ogg_uint32_t i;
- ogg_uint32_t DiffVal = ErrorSoFar;
-
- /* Decide on standard or MMX implementation */
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
- }
-
return DiffVal;
}
-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
- unsigned char * RefDataPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ErrorSoFar,
- ogg_uint32_t BestSoFar ) {
- ogg_uint32_t i;
- ogg_uint32_t DiffVal = ErrorSoFar;
-
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
- if ( DiffVal > BestSoFar )break;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
- }
-
- return DiffVal;
-}
-
static ogg_uint32_t GetHalfPixelSumAbsDiffs (unsigned char * SrcData,
unsigned char * RefDataPtr1,
unsigned char * RefDataPtr2,
@@ -265,119 +127,61 @@
ogg_uint32_t ErrorSoFar,
ogg_uint32_t BestSoFar ) {
- ogg_uint32_t i;
ogg_uint32_t DiffVal = ErrorSoFar;
ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
if ( RefOffset == 0 ) {
/* Simple case as for non 0.5 pixel */
- DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
- ErrorSoFar);
+ DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine,
+ RefDataPtr1, RefPixelsPerLine);
} else {
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
- (int)RefDataPtr2[0]) / 2) );
- DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
- (int)RefDataPtr2[1]) / 2) );
- DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
- (int)RefDataPtr2[2]) / 2) );
- DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
- (int)RefDataPtr2[3]) / 2) );
- DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
- (int)RefDataPtr2[4]) / 2) );
- DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
- (int)RefDataPtr2[5]) / 2) );
- DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
- (int)RefDataPtr2[6]) / 2) );
- DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
- (int)RefDataPtr2[7]) / 2) );
-
- if ( DiffVal > BestSoFar ) break;
-
- /* Step to next row of block. */
- SrcData += PixelsPerLine;
- RefDataPtr1 += RefPixelsPerLine;
- RefDataPtr2 += RefPixelsPerLine;
- }
+ DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine,
+ RefDataPtr1,
+ RefDataPtr2, RefPixelsPerLine, BestSoFar);
}
return DiffVal;
}
-static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
- ogg_uint32_t PixelsPerLine ) {
- ogg_uint32_t i;
- ogg_uint32_t XSum=0;
- ogg_uint32_t XXSum=0;
- unsigned char *DiffPtr;
-
- /* Loop expanded out for speed. */
- DiffPtr = DataPtr;
-
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-
- /* Examine alternate pixel locations. */
- XSum += DiffPtr[0];
- XXSum += DiffPtr[0]*DiffPtr[0];
- XSum += DiffPtr[1];
- XXSum += DiffPtr[1]*DiffPtr[1];
- XSum += DiffPtr[2];
- XXSum += DiffPtr[2]*DiffPtr[2];
- XSum += DiffPtr[3];
- XXSum += DiffPtr[3]*DiffPtr[3];
- XSum += DiffPtr[4];
- XXSum += DiffPtr[4]*DiffPtr[4];
- XSum += DiffPtr[5];
- XXSum += DiffPtr[5]*DiffPtr[5];
- XSum += DiffPtr[6];
- XXSum += DiffPtr[6]*DiffPtr[6];
- XSum += DiffPtr[7];
- XXSum += DiffPtr[7]*DiffPtr[7];
-
- /* Step to next row of block. */
- DiffPtr += PixelsPerLine;
- }
-
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-}
-
ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
ogg_uint32_t PixelsPerLine ) {
ogg_uint32_t LocalFragIndex = FragIndex;
ogg_uint32_t IntraError = 0;
+ dsp_static_save_fpu ();
+
/* Add together the intra errors for those blocks in the macro block
that are coded (Y only) */
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_static_intra8x8_err (&cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
-
LocalFragIndex++;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_static_intra8x8_err (&cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
LocalFragIndex = FragIndex + cpi->pb.HFragments;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_static_intra8x8_err (&cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
LocalFragIndex++;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_static_intra8x8_err (&cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
+ dsp_static_restore_fpu ();
+
return IntraError;
}
@@ -400,6 +204,8 @@
unsigned char * SrcPtr1;
unsigned char * RefPtr1;
+ dsp_static_save_fpu ();
+
/* Work out pixel offset into source buffer. */
PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
@@ -462,6 +268,9 @@
InterError += GetInterErr( SrcPtr1, RefPtr1,
&RefPtr1[RefPtr2Offset], PixelsPerLine );
}
+
+ dsp_static_restore_fpu ();
+
return InterError;
}
@@ -496,6 +305,8 @@
unsigned char * RefDataPtr1;
unsigned char * RefDataPtr2;
+ dsp_static_save_fpu ();
+
/* Note which of the four blocks in the macro block are to be
included in the search. */
MBlockDispFrags[0] = cpi->
@@ -518,20 +329,20 @@
/* Check the 0,0 candidate. */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] ) {
- Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[2] ) {
- Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[3] ) {
- Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
/* Set starting values to results of 0, 0 vector. */
@@ -554,24 +365,23 @@
/* Get the score for the current offset */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
- PixelsPerLine, Error, MinError );
+ Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( MBlockDispFrags[2] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
- PixelsPerLine, Error, MinError );
+ Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( MBlockDispFrags[3] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[3],
- CandidateBlockPtr + RefRow2Offset + 8,
- PixelsPerLine, Error, MinError );
+ Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( Error < MinError ) {
@@ -652,6 +462,8 @@
InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
FragIndex, MV->x, MV->y, PixelsPerLine );
+ dsp_static_restore_fpu ();
+
/* Return score of best matching block. */
return InterMVError;
}
@@ -684,6 +496,8 @@
unsigned char * RefDataPtr1;
unsigned char * RefDataPtr2;
+ dsp_static_save_fpu ();
+
/* Note which of the four blocks in the macro block are to be
included in the search. */
MBlockDispFrags[0] = cpi->
@@ -717,20 +531,20 @@
/* Summ errors for each block. */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine,
CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] ){
- Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[2] ){
- Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[3] ){
- Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
- PixelsPerLine, Error);
+ Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
/* Was this the best so far */
@@ -808,6 +622,8 @@
InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
FragIndex, MV->x, MV->y, PixelsPerLine );
+ dsp_static_restore_fpu ();
+
/* Return score of best matching block. */
return InterMVError;
}
@@ -850,8 +666,8 @@
for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
/* Get the block error score. */
- Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
- PixelsPerLine, 0);
+ Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
/* Was this the best so far */
if ( Error < MinError ) {
@@ -911,6 +727,8 @@
MOTION_VECTOR *MV ) {
ogg_uint32_t InterMVError;
+ dsp_static_save_fpu ();
+
/* For the moment the 4MV mode is only deemd to be valid if all four
Y blocks are to be updated */
/* This May be adapted later. */
@@ -941,6 +759,8 @@
InterMVError = HUGE_ERROR;
}
+ dsp_static_restore_fpu ();
+
/* Return score of best matching block. */
return InterMVError;
}
Index: lib/dct_decode.c
===================================================================
--- lib/dct_decode.c (revision 7621)
+++ lib/dct_decode.c (working copy)
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <string.h>
#include "encoder_internal.h"
+#include "dsp.h"
#define GOLDEN_FRAME_THRESH_Q 50
@@ -112,22 +113,6 @@
SetupBoundingValueArray_Generic(pbi, FLimit);
}
-void CopyBlock(unsigned char *src,
- unsigned char *dest,
- unsigned int srcstride){
- unsigned char *s = src;
- unsigned char *d = dest;
- unsigned int stride = srcstride;
-
- int j;
- for ( j = 0; j < 8; j++ ){
- ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0];
- ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1];
- s+=stride;
- d+=stride;
- }
-}
-
static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
ogg_uint32_t ReconPixelsPerLine;
ogg_int32_t ReconPixelIndex;
@@ -160,9 +145,8 @@
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
/* Get the pixel index for the first pixel in the fragment. */
- ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
- (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+ dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
+ (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
}
static void ExpandBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
@@ -237,10 +221,9 @@
/* Reconstruct the pixel data using the last frame reconstruction
and change data when the motion vector is (0,0), the recon is
based on the lastframe without loop filtering---- for testing */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
&pbi->LastFrameRecon[ReconPixelIndex],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}else if ( ModeUsesMC[pbi->CodingMode] ) {
/* The mode uses a motion vector. */
/* Get vector from list */
@@ -287,29 +270,30 @@
if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
/* Reconstruct the pixel dats from the reference frame and change data
(no half pixel in this case as the two references were the same. */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_static_recon_inter8x8 (
+ &pbi->ThisFrameRecon[ReconPixelIndex],
LastFrameRecPtr, pbi->ReconDataBuffer,
- ReconPixelsPerLine );
+ ReconPixelsPerLine);
}else{
/* Fractional pixel reconstruction. */
/* Note that we only use two pixels per reconstruction even for
the diagonal. */
- ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex],
LastFrameRecPtr, LastFrameRecPtr2,
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}
} else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
/* Golden frame with motion vector */
/* Reconstruct the pixel data using the golden frame
reconstruction and change data */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
&pbi->GoldenFrame[ ReconPixelIndex ],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
} else {
/* Simple Intra coding */
/* Get the pixel index for the first pixel in the fragment. */
- ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}
}
@@ -464,7 +448,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -476,7 +460,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -501,7 +485,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -513,7 +497,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
}
}
Index: lib/pp.c
===================================================================
--- lib/pp.c (revision 7621)
+++ lib/pp.c (working copy)
@@ -19,6 +19,7 @@
#include <string.h>
#include "encoder_internal.h"
#include "pp.h"
+#include "dsp.h"
#define MAX(a, b) ((a>b)?a:b)
#define MIN(a, b) ((a<b)?a:b)
@@ -490,7 +491,7 @@
} else {
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
@@ -529,7 +530,7 @@
DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
LineLength,Quality,QuantScale);
}else{
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
++Block;
@@ -565,7 +566,7 @@
DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
LineLength,Quality,QuantScale);
}else{
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
++Block;
Index: lib/encoder_internal.h
===================================================================
--- lib/encoder_internal.h (revision 7621)
+++ lib/encoder_internal.h (working copy)
@@ -24,6 +24,7 @@
#include <theora/theora.h>
#include "huffman.h"
+#include "dsp.h"
#ifndef LIBOGG2
#define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
@@ -689,23 +690,9 @@
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
-extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep );
+extern void dsp_recon_init (DspFunctions *funcs);
-extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr, ogg_int16_t * ChangePtr,
- ogg_uint32_t LineStep ) ;
-
-extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr1,
- unsigned char * RefPtr2,
- ogg_int16_t * ChangePtr,
- ogg_uint32_t LineStep ) ;
-
extern void SetupLoopFilter(PB_INSTANCE *pbi);
-extern void CopyBlock(unsigned char *src,
- unsigned char *dest,
- unsigned int srcstride);
extern void LoopFilter(PB_INSTANCE *pbi);
extern void ReconRefFrames (PB_INSTANCE *pbi);
extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock,
Index: lib/scan.c
===================================================================
--- lib/scan.c (revision 7621)
+++ lib/scan.c (working copy)
@@ -19,9 +19,20 @@
#include <math.h>
#include <string.h>
#include "encoder_internal.h"
+#include "dsp.h"
#define MAX_SEARCH_LINE_LEN 7
+#define SET8_0(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x00000000; \
+ ((ogg_uint32_t *)ptr)[1] = 0x00000000;
+#define SET8_1(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x01010101; \
+ ((ogg_uint32_t *)ptr)[1] = 0x01010101;
+#define SET8_8(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x08080808; \
+ ((ogg_uint32_t *)ptr)[1] = 0x08080808;
+
static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
0, 0, 0, 0, 2, 4, 12, 24
};
@@ -384,69 +395,6 @@
ppi->KFIndicator =
((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
}
-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
- unsigned char * Src2 ){
- ogg_uint32_t SadValue;
- ogg_uint32_t SadValue1;
-
- SadValue = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) +
- abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] );
-
- SadValue1 = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) +
- abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] );
-
- SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
- return SadValue;
-}
-
-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
- unsigned char * Src1,
- unsigned char * Src2 ){
- ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t MaxSad = 0;
- ogg_uint32_t i;
-
- for ( i = 0; i < 4; i++ ){
- SadValue[0] += abs(Src1[0] - Src2[0]);
- SadValue[1] += abs(Src1[1] - Src2[1]);
- SadValue[2] += abs(Src1[2] - Src2[2]);
- SadValue[3] += abs(Src1[3] - Src2[3]);
- SadValue[4] += abs(Src1[4] - Src2[4]);
- SadValue[5] += abs(Src1[5] - Src2[5]);
- SadValue[6] += abs(Src1[6] - Src2[6]);
- SadValue[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += ppi->PlaneStride;
- Src2 += ppi->PlaneStride;
- }
-
- for ( i = 0; i < 4; i++ ){
- SadValue2[0] += abs(Src1[0] - Src2[0]);
- SadValue2[1] += abs(Src1[1] - Src2[1]);
- SadValue2[2] += abs(Src1[2] - Src2[2]);
- SadValue2[3] += abs(Src1[3] - Src2[3]);
- SadValue2[4] += abs(Src1[4] - Src2[4]);
- SadValue2[5] += abs(Src1[5] - Src2[5]);
- SadValue2[6] += abs(Src1[6] - Src2[6]);
- SadValue2[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += ppi->PlaneStride;
- Src2 += ppi->PlaneStride;
- }
-
- for ( i = 0; i < 8; i++ ){
- if ( SadValue[i] > MaxSad )
- MaxSad = SadValue[i];
- if ( SadValue2[i] > MaxSad )
- MaxSad = SadValue2[i];
- }
-
- return MaxSad;
-}
-
-
static int RowSadScan( PP_INSTANCE *ppi,
unsigned char * YuvPtr1,
unsigned char * YuvPtr2,
@@ -475,7 +423,7 @@
for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
/* Calculate the SAD score for the block row */
- GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
+ GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2);
/* Now test the group SAD score */
if ( GrpSad > LocalGrpLowSadThresh ){
@@ -532,7 +480,7 @@
/* Skip if block already marked to be coded. */
if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
/* Calculate the SAD score for the block column */
- MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
+ MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
/* Now test the group SAD score */
if ( MaxSad > LocalGrpLowSadThresh ){
@@ -758,7 +706,7 @@
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -777,10 +725,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -816,7 +764,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -839,10 +787,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -876,7 +824,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -899,10 +847,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -935,7 +883,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -959,10 +907,10 @@
}else{
/* If we are breaking out here mark all pixels as changed.*/
if ( *DispFragPtr > BLOCK_NOT_CODED ) {
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
/* If we have a lot of changed pixels for this fragment on this
@@ -1071,7 +1019,7 @@
}
}else{
if ( *DispFragPtr > BLOCK_NOT_CODED )
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
/* Step pointers */
ChLocalsPtr += HFRAGPIXELS;
@@ -1133,7 +1081,7 @@
}
}else{
if ( *DispFragPtr > BLOCK_NOT_CODED )
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
/* Step pointers */
ChLocalsPtr += HFRAGPIXELS;
@@ -2126,10 +2074,12 @@
/* Fast break out test for obvious yes and no cases in this row of
blocks */
if ( i < ppi->PlaneVFragments ){
+ dsp_static_save_fpu ();
UpdatedOrCandidateBlocks |= RowSadScan( ppi, RawPlanePtr0,
RawPlanePtr1, DispFragPtr0 );
- if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
- UpdatedOrCandidateBlocks = 1;
+ UpdatedOrCandidateBlocks |= ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
+ dsp_static_restore_fpu ();
}else{
/* Make sure we still call other functions if RowSadScan() disabled */
UpdatedOrCandidateBlocks = 1;
Index: lib/dsp.c
===================================================================
--- lib/dsp.c (revision 0)
+++ lib/dsp.c (revision 0)
@@ -0,0 +1,416 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "cpu.h"
+#include "encoder_internal.h"
+
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+
+DspFunctions dsp_funcs;
+
+static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr,
+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine) {
+ int i;
+
+ /* For each block row */
+ for (i=8; i; i--) {
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ ReconPtr += ReconPixelsPerLine;
+ DctInputPtr += 8;
+ }
+}
+
+static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine) {
+ int i;
+ /* For each block row */
+ for (i=8; i; i--) {
+ /* INTRA mode so code raw image data */
+ /* We convert the data to 8 bit signed (by subtracting 128) as
+ this reduces the internal precision requirments in the DCT
+ transform. */
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ DctInputPtr += 8;
+ }
+}
+
+static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine)
+{
+ int i;
+
+ /* For each block row */
+ for (i=8; i; i--) {
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ ReconPtr1 += ReconPixelsPerLine;
+ ReconPtr2 += ReconPixelsPerLine;
+ DctInputPtr += 8;
+ }
+}
+
+static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2)
+{
+ ogg_uint32_t SadValue;
+ ogg_uint32_t SadValue1;
+
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+ DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+ DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+ DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+ DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+
+ SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
+
+ return SadValue;
+}
+
+static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2,
+ ogg_uint32_t stride)
+{
+ ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+ ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+ ogg_uint32_t MaxSad = 0;
+ ogg_uint32_t i;
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue[0] += abs(Src1[0] - Src2[0]);
+ SadValue[1] += abs(Src1[1] - Src2[1]);
+ SadValue[2] += abs(Src1[2] - Src2[2]);
+ SadValue[3] += abs(Src1[3] - Src2[3]);
+ SadValue[4] += abs(Src1[4] - Src2[4]);
+ SadValue[5] += abs(Src1[5] - Src2[5]);
+ SadValue[6] += abs(Src1[6] - Src2[6]);
+ SadValue[7] += abs(Src1[7] - Src2[7]);
+
+ Src1 += stride;
+ Src2 += stride;
+ }
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue2[0] += abs(Src1[0] - Src2[0]);
+ SadValue2[1] += abs(Src1[1] - Src2[1]);
+ SadValue2[2] += abs(Src1[2] - Src2[2]);
+ SadValue2[3] += abs(Src1[3] - Src2[3]);
+ SadValue2[4] += abs(Src1[4] - Src2[4]);
+ SadValue2[5] += abs(Src1[5] - Src2[5]);
+ SadValue2[6] += abs(Src1[6] - Src2[6]);
+ SadValue2[7] += abs(Src1[7] - Src2[7]);
+
+ Src1 += stride;
+ Src2 += stride;
+ }
+
+ for ( i = 0; i < 8; i++ ){
+ if ( SadValue[i] > MaxSad )
+ MaxSad = SadValue[i];
+ if ( SadValue2[i] > MaxSad )
+ MaxSad = SadValue2[i];
+ }
+
+ return MaxSad;
+}
+
+static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+ sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+ /* Step to next row of block. */
+ ptr1 += stride1;
+ ptr2 += stride2;
+ }
+
+ return sad;
+}
+
+static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+ sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+ if (sad > thres )
+ break;
+
+ /* Step to next row of block. */
+ ptr1 += stride1;
+ ptr2 += stride2;
+ }
+
+ return sad;
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+ sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+ sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+ sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+ sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+ sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+ sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+ sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+
+ if ( sad > thres )
+ break;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr1 += RefStride;
+ RefDataPtr2 += RefStride;
+ }
+
+ return sad;
+}
+
+static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+
+ for (i=8; i; i--) {
+ /* Examine alternate pixel locations. */
+ XSum += DataPtr[0];
+ XXSum += DataPtr[0]*DataPtr[0];
+ XSum += DataPtr[1];
+ XXSum += DataPtr[1]*DataPtr[1];
+ XSum += DataPtr[2];
+ XXSum += DataPtr[2]*DataPtr[2];
+ XSum += DataPtr[3];
+ XXSum += DataPtr[3]*DataPtr[3];
+ XSum += DataPtr[4];
+ XXSum += DataPtr[4]*DataPtr[4];
+ XSum += DataPtr[5];
+ XXSum += DataPtr[5]*DataPtr[5];
+ XSum += DataPtr[6];
+ XXSum += DataPtr[6]*DataPtr[6];
+ XSum += DataPtr[7];
+ XXSum += DataPtr[7]*DataPtr[7];
+
+ /* Step to next row of block. */
+ DataPtr += Stride;
+ }
+
+ /* Compute population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ) );
+}
+
+static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+ ogg_int32_t DiffVal;
+
+ for (i=8; i; i--) {
+ DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr += RefStride;
+ }
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+}
+
+static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+ ogg_int32_t DiffVal;
+
+ for (i=8; i; i--) {
+ DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr1 += RefStride;
+ RefDataPtr2 += RefStride;
+ }
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+}
+
+static void nop (void) { /* NOP */ }
+
+void dsp_init(DspFunctions *funcs)
+{
+ funcs->save_fpu = nop;
+ funcs->restore_fpu = nop;
+ funcs->sub8x8 = sub8x8__c;
+ funcs->sub8x8_128 = sub8x8_128__c;
+ funcs->sub8x8avg2 = sub8x8avg2__c;
+ funcs->row_sad8 = row_sad8__c;
+ funcs->col_sad8x8 = col_sad8x8__c;
+ funcs->sad8x8 = sad8x8__c;
+ funcs->sad8x8_thres = sad8x8_thres__c;
+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;
+ funcs->intra8x8_err = intra8x8_err__c;
+ funcs->inter8x8_err = inter8x8_err__c;
+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c;
+}
+
+void dsp_static_init(void)
+{
+ cpu_init ();
+ dsp_init (&dsp_funcs);
+ dsp_recon_init (&dsp_funcs);
+ dsp_dct_init (&dsp_funcs);
+ if (cpu_flags & CPU_X86_MMX) {
+ dsp_i386_mmx_init(&dsp_funcs);
+ }
+ if (cpu_flags & CPU_X86_MMXEXT) {
+ dsp_i386_mmxext_init(&dsp_funcs);
+ }
+}
+
Index: lib/Makefile.am
===================================================================
--- lib/Makefile.am (revision 7621)
+++ lib/Makefile.am (working copy)
@@ -3,12 +3,13 @@
lib_LTLIBRARIES = libtheora.la
if THEORA_SUPPORT_ENCODE
-encoder_sources = dct_encode.c encode.c encoder_toplevel.c
+encoder_sources = dct_encode.c encode.c encoder_toplevel.c
else
encoder_sources = encoder_disabled.c
endif
libtheora_la_SOURCES = \
+ cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c i386/recon_mmx.c i386/fdct_mmx.c \
blockmap.c \
comment.c \
dct.c \
Index: lib/blockmap.c
===================================================================
--- lib/blockmap.c (revision 7621)
+++ lib/blockmap.c (working copy)
@@ -21,7 +21,7 @@
ogg_uint32_t FirstSB,
ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
ogg_uint32_t VFrags ){
- ogg_uint32_t i, j;
+ ogg_uint32_t i, j = 0;
ogg_uint32_t xpos;
ogg_uint32_t ypos;
ogg_uint32_t SBrow, SBcol;
Index: lib/encoder_toplevel.c
===================================================================
--- lib/encoder_toplevel.c (revision 7621)
+++ lib/encoder_toplevel.c (working copy)
@@ -777,6 +777,8 @@
CP_INSTANCE *cpi;
+ dsp_static_init ();
+
memset(th, 0, sizeof(*th));
th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
Index: lib/toplevel.c
===================================================================
--- lib/toplevel.c (revision 7621)
+++ lib/toplevel.c (working copy)
@@ -290,6 +290,8 @@
PB_INSTANCE *pbi;
codec_setup_info *ci;
+ dsp_static_init ();
+
ci=(codec_setup_info *)c->codec_setup;
th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
Index: lib/dsp.h
===================================================================
--- lib/dsp.h (revision 0)
+++ lib/dsp.h (revision 0)
@@ -0,0 +1,154 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#ifndef DSP_H
+#define DSP_H
+
+#include <theora/theora.h>
+
+typedef struct
+{
+ void (*save_fpu) (void);
+ void (*restore_fpu) (void);
+
+ void (*sub8x8) (unsigned char *FiltPtr, unsigned char *ReconPtr,
+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine);
+
+ void (*sub8x8_128) (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine);
+
+ void (*sub8x8avg2) (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine);
+
+ void (*copy8x8) (unsigned char *src, unsigned char *dest,
+ ogg_uint32_t stride);
+
+ void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep);
+
+ void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr,
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
+
+ void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep);
+
+ void (*fdct_short) (ogg_int16_t *InputData, ogg_int16_t *OutputData);
+
+ ogg_uint32_t (*row_sad8) (unsigned char *Src1, unsigned char *Src2);
+
+ ogg_uint32_t (*col_sad8x8) (unsigned char *Src1, unsigned char *Src2,
+ ogg_uint32_t stride);
+
+ ogg_uint32_t (*sad8x8) (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2);
+
+ ogg_uint32_t (*sad8x8_thres) (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres);
+
+ ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres);
+
+ ogg_uint32_t (*intra8x8_err) (unsigned char *DataPtr, ogg_uint32_t Stride);
+
+ ogg_uint32_t (*inter8x8_err) (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride);
+
+ ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
+} DspFunctions;
+
+extern DspFunctions dsp_funcs;
+
+extern void dsp_recon_init (DspFunctions *funcs);
+
+void dsp_init(DspFunctions *funcs);
+void dsp_static_init(void);
+
+#define dsp_save_fpu(funcs) (funcs.save_fpu ())
+#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)
+
+#define dsp_restore_fpu(funcs) (funcs.restore_fpu ())
+#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs)
+
+#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) (funcs.sub8x8 (a1,a2,a3,a4,a5))
+#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5)
+
+#define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3))
+#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3)
+
+#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) (funcs.sub8x8avg2 (a1,a2,a3,a4,a5,a6))
+#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6)
+
+#define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1))
+#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) (funcs.recon_intra8x8 (ptr1,ptr2,str1))
+#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \
+ (funcs.recon_inter8x8 (ptr1,ptr2,ptr3,str1))
+#define dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \
+ dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1)
+
+#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \
+ (funcs.recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1))
+#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \
+ dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1)
+
+#define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out))
+#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out)
+
+#define dsp_row_sad8(funcs,ptr1,ptr2) (funcs.row_sad8 (ptr1,ptr2))
+#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2)
+
+#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) (funcs.col_sad8x8 (ptr1,ptr2,str1))
+#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) (funcs.sad8x8 (ptr1,str1,ptr2,str2))
+#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2)
+
+#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) (funcs.sad8x8_thres (ptr1,str1,ptr2,str2,t))
+#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t)
+
+#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \
+ (funcs.sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t))
+#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \
+ dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t)
+
+#define dsp_intra8x8_err(funcs,ptr1,str1) (funcs.intra8x8_err (ptr1,str1))
+#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1)
+
+#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \
+ (funcs.inter8x8_err (ptr1,str1,ptr2,str2))
+#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \
+ dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2)
+
+#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
+ (funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
+#define dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \
+ dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2)
+
+
+#endif /* DSP_H */
Index: lib/encode.c
===================================================================
--- lib/encode.c (revision 7621)
+++ lib/encode.c (working copy)
@@ -531,8 +531,7 @@
static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
ogg_int32_t BlockIndex ) {
- ogg_uint32_t i;
- ogg_uint32_t ErrorVal = 0;
+ ogg_uint32_t ErrorVal;
unsigned char * SrcDataPtr =
&cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
@@ -550,21 +549,8 @@
RecStride = cpi->pb.UVStride;
}
+ ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride);
- /* Decide on standard or MMX implementation */
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
- ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
- ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
- ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
- ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
- ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
- ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
- ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
- /* Step to next row of block. */
- SrcDataPtr += SrcStride;
- RecDataPtr += RecStride;
- }
return ErrorVal;
}
@@ -933,9 +919,13 @@
/* Zero Decoder EOB run count */
cpi->pb.EOB_Run = 0;
+ dsp_static_save_fpu ();
+
/* Encode any fragments coded using DCT. */
coded_pixels += QuadCodeDisplayFragments (cpi);
+ dsp_static_restore_fpu ();
+
return coded_pixels;
}
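
P.S. In case the indirection isn't obvious from the diff: everything funnels through the
DspFunctions table that dsp_static_init() fills in once, so callers never test cpu_flags
themselves. A minimal usage sketch of the intended calling pattern (the 8x8 buffers here
are invented purely for illustration; real callers pass pointers into the frame buffers
and their real strides):

#include "dsp.h"

static unsigned char src[8 * 8];
static unsigned char ref[8 * 8];

ogg_uint32_t example_block_sad (void)
{
  /* Fill the dispatch table once; the MMX/MMXEXT versions are selected
     automatically when cpu_init() reports support for them. */
  dsp_static_init ();

  /* Every later call goes through whichever implementation was selected. */
  return dsp_static_sad8x8 (src, 8, ref, 8);
}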