thr3ads.net - opus - [opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86. [Mar 2015]

If this information is useful, please help other people find it:
Share via:

Viswanath Puttagunta

2015-Mar-12 17:45 UTC

[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.

From: Jonathan Lennox <jonathan at vidyo.com>

* Makes ?enable-intrinsics work with clang and other non-GCC compilers
* Enables RTCD for the floating-point-mode SSE code in Celt.
* Disables use of RTCD in cases where the compiler targets an instruction set by
default.
* Enables the SSE4.1 Silk optimizations that apply to the common parts of Silk
when Opus is built in floating-point mode, not just in fixed-point mode.
* Enables the SSE intrinsics (with RTCD when appropriate) in the Win32 build.
* Fixes a case where GCC would compile SSE2 code as SSE4.1, causing a crash on
non-SSE4.1 CPUs.
* Allows configuration with compilers with non-GCC-flavor flags for enabling
architecture options.
* Hopefully makes the configuration and ifdef?s easier to follow and understand.

Reviewed-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org>
---
 Makefile.am                              |  38 ++--
 celt/arm/armcpu.c                        |   6 +-
 celt/arm/pitch_arm.h                     |   4 +-
 celt/bands.c                             |   6 +-
 celt/celt.c                              |  16 +-
 celt/celt.h                              |  12 +-
 celt/celt_decoder.c                      |   6 +-
 celt/celt_encoder.c                      |   4 +-
 celt/celt_lpc.h                          |   2 +-
 celt/cpu_support.h                       |  15 +-
 celt/mips/celt_mipsr1.h                  |   2 +-
 celt/pitch.c                             |   4 +-
 celt/pitch.h                             |  19 +-
 celt/tests/test_unit_dft.c               |   4 +-
 celt/tests/test_unit_mathops.c           |  11 +-
 celt/tests/test_unit_mdct.c              |   4 +-
 celt/tests/test_unit_rotation.c          |  11 +-
 celt/x86/celt_lpc_sse.c                  |   4 +
 celt/x86/celt_lpc_sse.h                  |  12 +-
 celt/x86/pitch_sse.c                     | 334 +++++++++++++------------------
 celt/x86/pitch_sse.h                     | 256 ++++++++++-------------
 celt/x86/pitch_sse2.c                    |  95 +++++++++
 celt/x86/pitch_sse4_1.c                  | 195 ++++++++++++++++++
 celt/x86/x86_celt_map.c                  |  76 ++++++-
 celt/x86/x86cpu.c                        |  47 ++++-
 celt/x86/x86cpu.h                        |  26 ++-
 celt_sources.mk                          |   5 +-
 configure.ac                             | 312 ++++++++++++++++++-----------
 m4/opus-intrinsics.m4                    |  29 +++
 silk/x86/SigProc_FIX_sse.h               |  17 ++
 silk/x86/main_sse.h                      |  48 +++++
 silk/x86/x86_silk_map.c                  |  25 ++-
 win32/VS2010/celt.vcxproj                |  17 +-
 win32/VS2010/celt.vcxproj.filters        |  27 +++
 win32/VS2010/silk_common.vcxproj         |  17 +-
 win32/VS2010/silk_common.vcxproj.filters |  23 ++-
 win32/VS2010/silk_fixed.vcxproj          |  13 +-
 win32/VS2010/silk_fixed.vcxproj.filters  |  17 +-
 win32/config.h                           |  25 ++-
 39 files changed, 1210 insertions(+), 574 deletions(-)
 create mode 100644 celt/x86/pitch_sse2.c
 create mode 100644 celt/x86/pitch_sse4_1.c
 create mode 100644 m4/opus-intrinsics.m4

diff --git a/Makefile.am b/Makefile.am
index c5c1562..3a75740 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -23,6 +23,9 @@ SILK_SOURCES += $(SILK_SOURCES_SSE4_1)
$(SILK_SOURCES_FIXED_SSE4_1)
 endif
 else
 SILK_SOURCES += $(SILK_SOURCES_FLOAT)
+if HAVE_SSE4_1
+SILK_SOURCES += $(SILK_SOURCES_SSE4_1)
+endif
 endif
 
 if DISABLE_FLOAT_API
@@ -30,12 +33,14 @@ else
 OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
 endif
 
-if HAVE_SSE4_1
-CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1)
-else
-if HAVE_SSE2
+if HAVE_SSE
 CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
+if HAVE_SSE2
+CELT_SOURCES += $(CELT_SOURCES_SSE2)
+endif
+if HAVE_SSE4_1
+CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
 endif
 
 if CPU_ARM
@@ -44,7 +49,6 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
 
 if OPUS_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
-OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon
 endif
 
 if HAVE_ARM_NE10
@@ -262,20 +266,30 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S):
$(top_srcdir)/celt/arm/arm2gnu.pl
 %-gnu.S: %.s
 	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
 
-SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o
+OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o)
+
+if HAVE_SSE
+SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
+$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
+endif
 
-if HAVE_SSE4_1
-$(SSE_OBJ): CFLAGS += -msse4.1
-else
 if HAVE_SSE2
-$(SSE_OBJ): CFLAGS += -msse2
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
 endif
+
+if HAVE_SSE4_1
+SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+             $(SILK_SOURCES_SSE4_1:.c=.lo) \
+             $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
+$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
 endif
 
 if OPUS_ARM_NEON_INTR
 CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
                          $(CELT_SOURCES_ARM_NE10:.c=.lo) \
-                         %test_unit_rotation.o %test_unit_mathops.o \
                          %test_unit_mdct.o %test_unit_dft.o
-$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS)
$(NE10_CFLAGS)
+
+$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS)
 endif
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index 1768525..5e5d10c 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
   }
-#   if defined(OPUS_ARM_MAY_HAVE_NEON)
+#   if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
   __try{
     /*VORR q0,q0,q0*/
     __emit(0xF2200150);
@@ -107,7 +107,7 @@ opus_uint32 opus_cpu_capabilities(void)
 
     while(fgets(buf, 512, cpuinfo) != NULL)
     {
-# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) ||
defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
       /* Search for edsp and neon flag */
       if(memcmp(buf, "Features", 8) == 0)
       {
@@ -118,7 +118,7 @@ opus_uint32 opus_cpu_capabilities(void)
           flags |= OPUS_CPU_ARM_EDSP;
 #  endif
 
-#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+#  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON;
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index 125d1bc..8626ed7 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -54,10 +54,10 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const
opus_val16 *_y,
 
 #else /* Start !FIXED_POINT */
 /* Float case */
-#if defined(OPUS_ARM_NEON_INTR)
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                                  opus_val32 *xcorr, int len, int max_pitch);
-#if !defined(OPUS_HAVE_RTCD)
+#if !defined(OPUS_HAVE_RTCD) || defined(OPUS_ARM_PRESUME_NEON_INTR)
 #define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
    ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
diff --git a/celt/bands.c b/celt/bands.c
index c643b09..25f229e 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -398,7 +398,7 @@ static void stereo_split(celt_norm * OPUS_RESTRICT X,
celt_norm * OPUS_RESTRICT
    }
 }
 
-static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT
Y, opus_val16 mid, int N)
+static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT
Y, opus_val16 mid, int N, int arch)
 {
    int j;
    opus_val32 xp=0, side=0;
@@ -410,7 +410,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X,
celt_norm * OPUS_RESTRICT
    opus_val32 t, lgain, rgain;
 
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   dual_inner_prod(Y, X, Y, N, &xp, &side);
+   dual_inner_prod(Y, X, Y, N, &xp, &side, arch);
    /* Compensating for the mid normalization */
    xp = MULT16_32_Q15(mid, xp);
    /* mid and side are in Q15, not Q14 like X and Y */
@@ -1348,7 +1348,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx,
celt_norm *X, celt_norm
    if (resynth)
    {
       if (N!=2)
-         stereo_merge(X, Y, mid, N);
+         stereo_merge(X, Y, mid, N, ctx->arch);
       if (inv)
       {
          int j;
diff --git a/celt/celt.c b/celt/celt.c
index a610de4..40c62ce 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -89,10 +89,12 @@ int resampling_factor(opus_int32 rate)
    return ret;
 }
 
-#ifndef OVERRIDE_COMB_FILTER_CONST
 /* This version should be faster on ARM */
 #ifdef OPUS_ARM_ASM
-static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
    opus_val32 x0, x1, x2, x3, x4;
@@ -147,7 +149,10 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x,
int T, int N,
 #endif
 }
 #else
-static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
    opus_val32 x0, x1, x2, x3, x4;
@@ -171,12 +176,11 @@ static void comb_filter_const(opus_val32 *y, opus_val32
*x, int T, int N,
 
 }
 #endif
-#endif
 
 #ifndef OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap)
+      const opus_val16 *window, int overlap, int arch)
 {
    int i;
    /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
@@ -234,7 +238,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int
T1, int N,
    }
 
    /* Compute the part with the constant filter. */
-   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);
 }
 #endif /* OVERRIDE_comb_filter */
 
diff --git a/celt/celt.h b/celt/celt.h
index b196751..a423b95 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -201,7 +201,17 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT
pcmp, celt_sig * OPUS_RES
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap);
+      const opus_val16 *window, int overlap, int arch);
+
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch)		\
+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
 
 void init_caps(const CELTMode *m,int *cap,int LM,int C);
 
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 304f334..505a6ef 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -699,7 +699,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st,
int N, int LM)
          comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
               st->postfilter_period, st->postfilter_period, overlap,
               -st->postfilter_gain, -st->postfilter_gain,
-              st->postfilter_tapset, st->postfilter_tapset, NULL, 0);
+              st->postfilter_tapset, st->postfilter_tapset, NULL, 0,
st->arch);
 
          /* Simulate TDAC on the concealed audio so that it blends with the
             MDCT of the next frame. */
@@ -1011,11 +1011,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st,
const unsigned char *dat
       st->postfilter_period_old=IMAX(st->postfilter_period_old,
COMBFILTER_MINPERIOD);
       comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old,
st->postfilter_period, mode->shortMdctSize,
             st->postfilter_gain_old, st->postfilter_gain,
st->postfilter_tapset_old, st->postfilter_tapset,
-            mode->window, overlap);
+            mode->window, overlap, st->arch);
       if (LM!=0)
          comb_filter(out_syn[c]+mode->shortMdctSize,
out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch,
N-mode->shortMdctSize,
                st->postfilter_gain, postfilter_gain,
st->postfilter_tapset, postfilter_tapset,
-               mode->window, overlap);
+               mode->window, overlap, st->arch);
 
    } while (++c<CC);
    st->postfilter_period_old = st->postfilter_period;
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 5f48638..1c9dbcb 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1166,11 +1166,11 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in,
celt_sig *prefilter_mem,
       if (offset)
          comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD,
                st->prefilter_period, st->prefilter_period, offset,
-st->prefilter_gain, -st->prefilter_gain,
-               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0,
st->arch);
 
       comb_filter(in+c*(N+overlap)+overlap+offset,
pre[c]+COMBFILTER_MAXPERIOD+offset,
             st->prefilter_period, pitch_index, N-offset,
-st->prefilter_gain, -gain1,
-            st->prefilter_tapset, prefilter_tapset, mode->window,
overlap);
+            st->prefilter_tapset, prefilter_tapset, mode->window,
overlap, st->arch);
       OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap);
 
       if (N>COMBFILTER_MAXPERIOD)
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index dc8967f..323459e 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -48,7 +48,7 @@ void celt_fir_c(
          opus_val16 *mem,
          int arch);
 
-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if !defined(OVERRIDE_CELT_FIR)
 #define celt_fir(x, num, y, N, ord, mem, arch) \
     (celt_fir_c(x, num, y, N, ord, mem, arch))
 #endif
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 1d62e2f..5e99a90 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -32,7 +32,8 @@
 #include "opus_defines.h"
 
 #if defined(OPUS_HAVE_RTCD) && \
-  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
+  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+
 #include "arm/armcpu.h"
 
 /* We currently support 4 ARM variants:
@@ -43,14 +44,16 @@
  */
 #define OPUS_ARCHMASK 3
 
-#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#elif (defined(OPUS_X86_MAY_HAVE_SSE) &&
!defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
|| \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1))
 
 #include "x86/x86cpu.h"
-/* We currently support 3 x86 variants:
+/* We currently support 4 x86 variants:
  * arch[0] -> non-sse
- * arch[1] -> sse2
- * arch[2] -> sse4.1
- * arch[3] -> NULL
+ * arch[1] -> sse
+ * arch[2] -> sse2
+ * arch[3] -> sse4.1
  */
 #define OPUS_ARCHMASK 3
 int opus_select_arch(void);
diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h
index 03915d8..7915d59 100644
--- a/celt/mips/celt_mipsr1.h
+++ b/celt/mips/celt_mipsr1.h
@@ -56,7 +56,7 @@
 #define OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap)
+      const opus_val16 *window, int overlap, int arch)
 {
    int i;
    opus_val32 x0, x1, x2, x3, x4;
diff --git a/celt/pitch.c b/celt/pitch.c
index 4364703..1d89cb0 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -439,7 +439,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int
minperiod,
 
    T = T0 = *T0_;
    ALLOC(yy_lookup, maxperiod+1, opus_val32);
-   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
+   dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);
    yy_lookup[0] = xx;
    yy=xx;
    for (i=1;i<=maxperiod;i++)
@@ -483,7 +483,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int
minperiod,
       {
          T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
       }
-      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2,
arch);
       xy += xy2;
       yy = yy_lookup[T1] + yy_lookup[T1b];
 #ifdef FIXED_POINT
diff --git a/celt/pitch.h b/celt/pitch.h
index 4368cc5..af745eb 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -37,8 +37,8 @@
 #include "modes.h"
 #include "cpu_support.h"
 
-#if defined(__SSE__) && !defined(FIXED_POINT) \
- || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \
+  || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
&& defined(FIXED_POINT))
 #include "x86/pitch_sse.h"
 #endif
 
@@ -135,8 +135,7 @@ static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x,
const opus_val16 *
 #endif /* OVERRIDE_XCORR_KERNEL */
 
 
-#ifndef OVERRIDE_DUAL_INNER_PROD
-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16
*y01, const opus_val16 *y02,
+static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16
*y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
 {
    int i;
@@ -150,6 +149,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16
*x, const opus_val16 *y
    *xy1 = xy01;
    *xy2 = xy02;
 }
+
+#ifndef OVERRIDE_DUAL_INNER_PROD
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))
 #endif
 
 /*We make sure a C version is always available for cases where the overhead of
@@ -169,6 +172,12 @@ static OPUS_INLINE opus_val32 celt_inner_prod_c(const
opus_val16 *x,
     ((void)(arch),celt_inner_prod_c(x, y, N))
 #endif
 
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+     opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+
 #ifdef FIXED_POINT
 opus_val32
 #else
@@ -180,7 +189,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16
*_y,
 #if !defined(OVERRIDE_PITCH_XCORR)
 /*Is run-time CPU detection enabled on this platform?*/
 # if defined(OPUS_HAVE_RTCD) && \
-  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
+  (defined(OPUS_ARM_ASM) || (defined(OPUS_ARM_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)))
 extern
 #  if defined(FIXED_POINT)
 opus_val32
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 84f69bd..57691c6 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -50,7 +50,7 @@
 #include "entcode.c"
 
 #if defined(OPUS_HAVE_RTCD) && \
-         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
+         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 #include "arm/armcpu.c"
 #if defined(HAVE_ARM_NE10)
 #include "arm/celt_ne10_fft.c"
@@ -60,6 +60,8 @@
 #include "arm/arm_celt_map.c"
 #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
 #include "x86/x86cpu.c"
+#include "celt/x86/pitch_sse.c"
+#include "x86/x86_celt_map.c"
 #endif
 
 #ifndef M_PI
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 0f1e4f1..379fbd5 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -49,12 +49,21 @@
 #include "cwrs.c"
 #include "pitch.c"
 #include "celt_lpc.c"
+#include "celt.c"
 #include "kiss_fft.c"
 #include "mdct.c"
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#if defined(OPUS_X86_MAY_HAVE_SSE) || \
+    defined(OPUS_X86_MAY_HAVE_SSE2) || \
+    defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if defined(OPUS_X86_MAY_HAVE_SSE)
 #include "x86/pitch_sse.c"
+#endif
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse2.c"
+#endif
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/pitch_sse4_1.c"
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index c64cac2..d8c4ef0 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -49,7 +49,7 @@
 #include "entcode.c"
 
 #if defined(OPUS_HAVE_RTCD) && \
-         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
+         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 #include "arm/armcpu.c"
 #if defined(HAVE_ARM_NE10)
 #include "arm/celt_ne10_fft.c"
@@ -60,6 +60,8 @@
 
 #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
 #include "x86/x86cpu.c"
+#include "celt/x86/pitch_sse.c"
+#include "x86/x86_celt_map.c"
 #endif
 
 #ifndef M_PI
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index ce14936..3cf54fa 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -46,13 +46,22 @@
 #include "bands.h"
 #include "pitch.c"
 #include "celt_lpc.c"
+#include "celt.c"
 #include "kiss_fft.c"
 #include "mdct.c"
 #include <math.h>
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#if defined(OPUS_X86_MAY_HAVE_SSE) || \
+    defined(OPUS_X86_MAY_HAVE_SSE2) || \
+    defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if defined(OPUS_X86_MAY_HAVE_SSE)
 #include "x86/pitch_sse.c"
+#endif
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse2.c"
+#endif
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/pitch_sse4_1.c"
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
index 9fb9779..67e5592 100644
--- a/celt/x86/celt_lpc_sse.c
+++ b/celt/x86/celt_lpc_sse.c
@@ -38,6 +38,8 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
+#if defined(FIXED_POINT)
+
 void celt_fir_sse4_1(const opus_val16 *_x,
          const opus_val16 *num,
          opus_val16 *_y,
@@ -126,3 +128,5 @@ void celt_fir_sse4_1(const opus_val16 *_x,
 #endif
    RESTORE_STACK;
 }
+
+#endif
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index f111420..c5ec796 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -32,7 +32,9 @@
 #include "config.h"
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_FIR
+
 void celt_fir_sse4_1(
          const opus_val16 *x,
          const opus_val16 *num,
@@ -42,6 +44,12 @@ void celt_fir_sse4_1(
          opus_val16 *mem,
          int arch);
 
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+
+#else
+
 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
          const opus_val16 *num,
@@ -56,3 +64,5 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
 
 #endif
 #endif
+
+#endif
diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c
index e3bc6d7..20e7312 100644
--- a/celt/x86/pitch_sse.c
+++ b/celt/x86/pitch_sse.c
@@ -29,223 +29,157 @@
 #include "config.h"
 #endif
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
 #include "macros.h"
 #include "celt_lpc.h"
 #include "stack_alloc.h"
 #include "mathops.h"
 #include "pitch.h"
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
-#include <smmintrin.h>
-#include "x86cpu.h"
-
-opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
-      int N)
-{
-    opus_int  i, dataSize16;
-    opus_int32 sum;
-    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
-    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
-    __m128i inVec1_3210, inVec2_3210;
-
-    sum = 0;
-    dataSize16 = N & ~15;
-
-    acc1 = _mm_setzero_si128();
-    acc2 = _mm_setzero_si128();
-
-    for (i=0;i<dataSize16;i+=16) {
-        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
-        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
-
-        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
-        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
-
-        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
-        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
-
-        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
-        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
-    }
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
-    acc1 = _mm_add_epi32(acc1, acc2);
-
-    if (N - i >= 8)
-    {
-        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
-        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
-
-        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
-
-        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
-        i += 8;
-    }
-
-    if (N - i >= 4)
-    {
-        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
-        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
-
-        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
-
-        acc1 = _mm_add_epi32(acc1, inVec1_3210);
-        i += 4;
-    }
-
-    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
-    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
-
-    sum += _mm_cvtsi128_si32(acc1);
-
-    for (;i<N;i++)
-    {
-        sum = silk_SMLABB(sum, x[i], y[i]);
-    }
+#include <xmmintrin.h>
+#include "arch.h"
 
-    return sum;
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32
sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
+   if (j < len)
+   {
+      xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 =
_mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
 }
 
-void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
sum[ 4 ], int len)
+
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const
opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
 {
-    int j;
-
-    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
-    __m128i vecY0, vecY1, vecY2, vecY3;
-    __m128i sum0, sum1, sum2, sum3, vecSum;
-    __m128i initSum;
-
-    celt_assert(len >= 3);
-
-    sum0 = _mm_setzero_si128();
-    sum1 = _mm_setzero_si128();
-    sum2 = _mm_setzero_si128();
-    sum3 = _mm_setzero_si128();
-
-    for (j=0;j<(len-7);j+=8)
-    {
-        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
-        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
-        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
-        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
-        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
-
-        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
-        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
-        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
-    }
-
-    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
-    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
-
-    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
-    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
-
-    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
-    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
-
-    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
-    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
-
-    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
-          _mm_unpacklo_epi32(sum2, sum3));
-
-    for (;j<(len-3);j+=4)
-    {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
-        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
-        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
-        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
-
-        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
-        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
-        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
-        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
-
-        sum0 = _mm_mullo_epi32(vecX0, vecY0);
-        sum1 = _mm_mullo_epi32(vecX1, vecY1);
-        sum2 = _mm_mullo_epi32(vecX2, vecY2);
-        sum3 = _mm_mullo_epi32(vecX3, vecY3);
-
-        sum0 = _mm_add_epi32(sum0, sum1);
-        sum2 = _mm_add_epi32(sum2, sum3);
-        vecSum = _mm_add_epi32(vecSum, sum0);
-        vecSum = _mm_add_epi32(vecSum, sum2);
-    }
-
-    for (;j<len;j++)
-    {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
-
-        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
-
-        sum0 = _mm_mullo_epi32(vecX0, vecY0);
-        vecSum = _mm_add_epi32(vecSum, sum0);
-    }
-
-    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
-    initSum = _mm_add_epi32(initSum, vecSum);
-    _mm_storeu_si128((__m128i *)sum, initSum);
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
 }
-#endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2)
-opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
       int N)
 {
-    opus_int  i, dataSize16;
-    opus_int32 sum;
-
-    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
-    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
-
-    sum = 0;
-    dataSize16 = N & ~15;
-
-    acc1 = _mm_setzero_si128();
-    acc2 = _mm_setzero_si128();
-
-    for (i=0;i<dataSize16;i+=16)
-    {
-        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
-        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
-
-        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
-        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
-
-        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
-        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
-
-        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
-        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
-    }
-
-    acc1 = _mm_add_epi32( acc1, acc2 );
-
-    if (N - i >= 8)
-    {
-        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
-        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
-
-        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+   int i;
+   float xy;
+   __m128 sum;
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+   }
+   /* Horizontal sum */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+   _mm_store_ss(&xy, sum);
+   for (;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
 
-        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
-        i += 8;
-    }
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
 
-    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
-    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
-    sum += _mm_cvtsi128_si32(acc1);
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
 
-    for (;i<N;i++) {
-        sum = silk_SMLABB(sum, x[i], y[i]);
-    }
 
-    return sum;
-}
 #endif
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 99d1919..cbe722c 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -37,17 +37,37 @@
 #include "config.h"
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
 void xcorr_kernel_sse4_1(
                     const opus_int16 *x,
                     const opus_int16 *y,
                     opus_val32       sum[4],
                     int              len);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+void xcorr_kernel_sse(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#endif
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse(x, y, sum, len))
+
+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) ||
(defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
 
 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
-                    const opus_int16 *x,
-                    const opus_int16 *y,
+                    const opus_val16 *x,
+                    const opus_val16 *y,
                     opus_val32       sum[4],
                     int              len);
 
@@ -55,181 +75,115 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
 
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
 opus_val32 celt_inner_prod_sse4_1(
     const opus_int16 *x,
     const opus_int16 *y,
     int               N);
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
 opus_val32 celt_inner_prod_sse2(
     const opus_int16 *x,
     const opus_int16 *y,
     int               N);
 #endif
 
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    int               N);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+	((void)arch, celt_inner_prod_sse4_1(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) &&
!defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+	((void)arch, celt_inner_prod_sse2(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+	((void)arch, celt_inner_prod_sse(x, y, N))
+
+
+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
&& defined(FIXED_POINT)) || \
+	(defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
-                    const opus_int16 *x,
-                    const opus_int16 *y,
+                    const opus_val16 *x,
+                    const opus_val16 *y,
                     int               N);
 
 #define OVERRIDE_CELT_INNER_PROD
 #define celt_inner_prod(x, y, N, arch) \
     ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
-#else
 
-#include <xmmintrin.h>
-#include "arch.h"
+#endif
 
-#define OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16
*y, opus_val32 sum[4], int len)
-{
-   int j;
-   __m128 xsum1, xsum2;
-   xsum1 = _mm_loadu_ps(sum);
-   xsum2 = _mm_setzero_ps();
-
-   for (j = 0; j < len-3; j += 4)
-   {
-      __m128 x0 = _mm_loadu_ps(x+j);
-      __m128 yj = _mm_loadu_ps(y+j);
-      __m128 y3 = _mm_loadu_ps(y+j+3);
-
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
-                                          _mm_shuffle_ps(yj,y3,0x49)));
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
-                                          _mm_shuffle_ps(yj,y3,0x9e)));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
-   }
-   if (j < len)
-   {
-      xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-      if (++j < len)
-      {
-         xsum2 =
_mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-         if (++j < len)
-         {
-            xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-         }
-      }
-   }
-   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
-}
-
-#define xcorr_kernel(_x, _y, _z, len, arch) \
-    ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
 #define OVERRIDE_DUAL_INNER_PROD
-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16
*y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
-{
-   int i;
-   __m128 xsum1, xsum2;
-   xsum1 = _mm_setzero_ps();
-   xsum2 = _mm_setzero_ps();
-   for (i=0;i<N-3;i+=4)
-   {
-      __m128 xi = _mm_loadu_ps(x+i);
-      __m128 y1i = _mm_loadu_ps(y01+i);
-      __m128 y2i = _mm_loadu_ps(y02+i);
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
-   }
-   /* Horizontal sum */
-   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
-   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
-   _mm_store_ss(xy1, xsum1);
-   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
-   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
-   _mm_store_ss(xy2, xsum2);
-   for (;i<N;i++)
-   {
-      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
-      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
-   }
-}
+#define OVERRIDE_COMB_FILTER_CONST
 
-#define OVERRIDE_CELT_INNER_PROD
-static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const
opus_val16 *y,
-      int N)
-{
-   int i;
-   float xy;
-   __m128 sum;
-   sum = _mm_setzero_ps();
-   /* FIXME: We should probably go 8-way and use 2 sums. */
-   for (i=0;i<N-3;i+=4)
-   {
-      __m128 xi = _mm_loadu_ps(x+i);
-      __m128 yi = _mm_loadu_ps(y+i);
-      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
-   }
-   /* Horizontal sum */
-   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
-   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
-   _mm_store_ss(&xy, sum);
-   for (;i<N;i++)
-   {
-      xy = MAC16_16(xy, x[i], y[i]);
-   }
-   return xy;
-}
-
-#  define celt_inner_prod(_x, _y, len, arch) \
-    ((void)(arch),celt_inner_prod_sse(_x, _y, len))
+void dual_inner_prod_sse(const opus_val16 *x,
+	const opus_val16 *y01,
+	const opus_val16 *y02,
+	int               N,
+	opus_val32       *xy1,
+	opus_val32       *xy2);
+
+void comb_filter_const_sse(opus_val32 *y,
+	opus_val32 *x,
+	int         T,
+	int         N,
+	opus_val16  g10,
+	opus_val16  g11,
+	opus_val16  g12);
+
+
+#if defined(OPUS_X86_PRESUME_SSE)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
 
 #define OVERRIDE_COMB_FILTER_CONST
-static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T,
int N,
-      opus_val16 g10, opus_val16 g11, opus_val16 g12)
-{
-   int i;
-   __m128 x0v;
-   __m128 g10v, g11v, g12v;
-   g10v = _mm_load1_ps(&g10);
-   g11v = _mm_load1_ps(&g11);
-   g12v = _mm_load1_ps(&g12);
-   x0v = _mm_loadu_ps(&x[-T-2]);
-   for (i=0;i<N-3;i+=4)
-   {
-      __m128 yi, yi2, x1v, x2v, x3v, x4v;
-      const opus_val32 *xp = &x[i-T-2];
-      yi = _mm_loadu_ps(x+i);
-      x4v = _mm_loadu_ps(xp+4);
-#if 0
-      /* Slower version with all loads */
-      x1v = _mm_loadu_ps(xp+1);
-      x2v = _mm_loadu_ps(xp+2);
-      x3v = _mm_loadu_ps(xp+3);
-#else
-      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
-      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
-      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
-#endif
 
-      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
-#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
-      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
-      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
 #else
-      /* Use partial sums */
-      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
-                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
-      yi = _mm_add_ps(yi, yi2);
+
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+              const opus_val16 *x,
+              const opus_val16 *y01,
+              const opus_val16 *y02,
+              int               N,
+              opus_val32       *xy1,
+              opus_val32       *xy2);
+
+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)			\
+    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1,
xy2))
+
+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+              opus_val32 *y,
+              opus_val32 *x,
+              int         T,
+              int         N,
+              opus_val16  g10,
+              opus_val16  g11,
+              opus_val16  g12);
+
+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch)				\
+    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10,
g11, g12))
+
+#define NON_STATIC_COMB_FILTER_CONST_C
+
 #endif
-      x0v=x4v;
-      _mm_storeu_ps(y+i, yi);
-   }
-#ifdef CUSTOM_MODES
-   for (;i<N;i++)
-   {
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x[i-T])
-               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
-               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
-   }
 #endif
-}
 
 #endif
-#endif
diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c
new file mode 100644
index 0000000..a0e7d1b
--- /dev/null
+++ b/celt/x86/pitch_sse2.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32( acc1, acc2 );
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++) {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+#endif
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
new file mode 100644
index 0000000..a092c68
--- /dev/null
+++ b/celt/x86/pitch_sse4_1.c
@@ -0,0 +1,195 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+    __m128i inVec1_3210, inVec2_3210;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16) {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32(acc1, acc2);
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    if (N - i >= 4)
+    {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_3210);
+        i += 4;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++)
+    {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
sum[ 4 ], int len)
+{
+    int j;
+
+    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+    __m128i vecY0, vecY1, vecY2, vecY3;
+    __m128i sum0, sum1, sum2, sum3, vecSum;
+    __m128i initSum;
+
+    celt_assert(len >= 3);
+
+    sum0 = _mm_setzero_si128();
+    sum1 = _mm_setzero_si128();
+    sum2 = _mm_setzero_si128();
+    sum3 = _mm_setzero_si128();
+
+    for (j=0;j<(len-7);j+=8)
+    {
+        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+    }
+
+    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+
+    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+          _mm_unpacklo_epi32(sum2, sum3));
+
+    for (;j<(len-3);j+=4)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+        sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+        sum0 = _mm_add_epi32(sum0, sum1);
+        sum2 = _mm_add_epi32(sum2, sum3);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+
+    for (;j<len;j++)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+    }
+
+    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+    initSum = _mm_add_epi32(initSum, vecSum);
+    _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 83410db..1ed2acb 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -38,6 +38,8 @@
 
 # if defined(FIXED_POINT)
 
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1)
+
 void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
          const opus_val16 *num,
@@ -49,8 +51,8 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
 ) = {
   celt_fir_c,                /* non-sse */
   celt_fir_c,
+  celt_fir_c,
   MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
-  NULL
 };
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
@@ -61,24 +63,86 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 ) = {
   xcorr_kernel_c,                /* non-sse */
   xcorr_kernel_c,
+  xcorr_kernel_c,
   MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
-  NULL
 };
 
+#endif
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1)) ||  \
+	(!defined(OPUS_X86_MAY_HAVE_SSE_4_1) &&
defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
+
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
          const opus_val16 *y,
          int              N
 ) = {
   celt_inner_prod_c,                /* non-sse */
+  celt_inner_prod_c,
   MAY_HAVE_SSE2(celt_inner_prod),
   MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
-  NULL
 };
 
+#endif
+
 # else
-#  error "Floating-point implementation is not supported by x86 RTCD
yet." \
- "Reconfigure with --disable-rtcd or send patches."
-# endif
 
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* non-sse */
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int              N
+) = {
+  celt_inner_prod_c,                /* non-sse */
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y01,
+                    const opus_val16 *y02,
+                    int               N,
+                    opus_val32       *xy1,
+                    opus_val32       *xy2
+) = {
+  dual_inner_prod_c,                /* non-sse */
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+};
+
+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+              opus_val32 *y,
+              opus_val32 *x,
+              int         T,
+              int         N,
+              opus_val16  g10,
+              opus_val16  g11,
+              opus_val16  g12
+) = {
+  comb_filter_const_c,                /* non-sse */
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+};
+
+
+#endif
+
+#endif
 #endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index c82a4b7..afcdeb6 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,10 +35,19 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE))
|| \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
|| \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1))
+
+
 #if defined(_MSC_VER)
 
 #include <intrin.h>
-#define cpuid(info,x) __cpuid(info,x)
+static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+	__cpuid((int*)CPUInfo, InfoType);
+}
+
 #else
 
 #if defined(CPU_INFO_BY_C)
@@ -48,14 +57,28 @@
 static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
 {
 #if defined(CPU_INFO_BY_ASM)
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+    __asm__ __volatile__ (
+       "xchg %%ebx, %1\n"
+       "cpuid\n"
+       "xchg %%ebx, %1\n":
+        "=a" (CPUInfo[0]),
+        "=r" (CPUInfo[1]),
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "0" (InfoType)
+    );
+#else
     __asm__ __volatile__ (
         "cpuid":
         "=a" (CPUInfo[0]),
         "=b" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "a" (InfoType), "c" (0)
+        "0" (InfoType)
     );
+#endif
 #elif defined(CPU_INFO_BY_C)
     __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]),
&(CPUInfo[2]), &(CPUInfo[3]));
 #endif
@@ -63,11 +86,9 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int
InfoType)
 
 #endif
 
-#include "SigProc_FIX.h"
-#include "celt_lpc.h"
-
 typedef struct CPU_Feature{
     /*  SIMD: 128-bit */
+    int HW_SSE;
     int HW_SSE2;
     int HW_SSE41;
 } CPU_Feature;
@@ -82,19 +103,31 @@ static void opus_cpu_feature_check(CPU_Feature
*cpu_feature)
 
     if (nIds >= 1){
         cpuid(info, 1);
+        cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
     }
+    else {
+        cpu_feature->HW_SSE = 0;
+        cpu_feature->HW_SSE2 = 0;
+        cpu_feature->HW_SSE41 = 0;
+    }
 }
 
 int opus_select_arch(void)
 {
-    CPU_Feature cpu_feature = {0};
+    CPU_Feature cpu_feature;
     int arch;
 
     opus_cpu_feature_check(&cpu_feature);
 
     arch = 0;
+    if (!cpu_feature.HW_SSE)
+    {
+       return arch;
+    }
+    arch++;
+
     if (!cpu_feature.HW_SSE2)
     {
        return arch;
@@ -109,3 +142,5 @@ int opus_select_arch(void)
 
     return arch;
 }
+
+#endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index ef53f0c..7f4c61d 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -28,6 +28,12 @@
 #if !defined(X86CPU_H)
 # define X86CPU_H
 
+# if defined(OPUS_X86_MAY_HAVE_SSE)
+#  define MAY_HAVE_SSE(name) name ## _sse
+# else
+#  define MAY_HAVE_SSE(name) name ## _c
+# endif
+
 # if defined(OPUS_X86_MAY_HAVE_SSE2)
 #  define MAY_HAVE_SSE2(name) name ## _sse2
 # else
@@ -55,21 +61,25 @@ int opus_select_arch(void);
   reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
   optimize this out when optimizations ARE enabled.
 
-  It appears clang requires us to do this always (which is fair, since
-  technically the compiler is always allowed to do the dereference before
-  invoking the function implementing the intrinsic). I have not investiaged
-  whether it is any smarter than gcc when it comes to eliminating the extra
-  load instruction.*/
+  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
+  (which is fair, since technically the compiler is always allowed to do the
+  dereference before invoking the function implementing the intrinsic).
+  However, it is smart enough to eliminate the extra MOVD instruction. 
+  For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize
out
+  the extra MOVQ if it's specified explicitly */
+
 # if defined(__clang__) || !defined(__OPTIMIZE__)
 #  define OP_CVTEPI8_EPI32_M32(x) \
  (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-
-#  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
 # else
 #  define OP_CVTEPI8_EPI32_M32(x) \
  (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+#endif
 
+# if !defined(__OPTIMIZE__)
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
 #  define OP_CVTEPI16_EPI32_M64(x) \
  (_mm_cvtepi16_epi32(*(__m128i *)(x)))
 # endif
diff --git a/celt_sources.mk b/celt_sources.mk
index 7121301..2ffe99a 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -21,7 +21,10 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \
 celt/x86/x86_celt_map.c \
 celt/x86/pitch_sse.c
 
-CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c
+CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c
+
+CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \
+celt/x86/pitch_sse4_1.c
 
 CELT_SOURCES_ARM = \
 celt/arm/armcpu.c \
diff --git a/configure.ac b/configure.ac
index baa3425..9b05fc1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -348,8 +348,24 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
 AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
     [test x"${asm_optimization%% *}" = x"ARM"])
 
-AM_CONDITIONAL([HAVE_SSE4_1], [false])
+AM_CONDITIONAL([HAVE_SSE], [false])
 AM_CONDITIONAL([HAVE_SSE2], [false])
+AM_CONDITIONAL([HAVE_SSE4_1], [false])
+
+m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse])
+m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2])
+m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1])
+m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon])
+
+AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics
@<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@])
+AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics
@<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@])
+AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics
@<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@])
+AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON
intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS[@:>@])
+
+AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS],
DEFAULT_X86_SSE_CFLAGS)])
+AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS],
DEFAULT_X86_SSE2_CFLAGS)])
+AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS],
DEFAULT_X86_SSE4_1_CFLAGS)])
+AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS],
DEFAULT_ARM_NEON_INTR_CFLAGS)])
 
 AC_DEFUN([OPUS_PATH_NE10],
    [
@@ -426,45 +442,53 @@ AC_DEFUN([OPUS_PATH_NE10],
 )
 
 AS_IF([test x"$enable_intrinsics" = x"yes"],[
-   case $host_cpu in
-   arm*)
+   intrinsics_support=""
+   AS_CASE([$host_cpu],
+   [arm*],
+   [
       cpu_arm=yes
-      AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics)
-      save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
-      AC_LINK_IFELSE(
-         [
-            AC_LANG_PROGRAM(
-               [[#include <arm_neon.h>
-               ]],
-               [[
-                  static float32x4_t A[2], SUMM;
-                  SUMM = vmlaq_f32(SUMM, A[0], A[1]);
-               ]]
-            )
-         ],[
-            OPUS_ARM_NEON_INTR=1
-            AC_MSG_RESULT([yes])
-         ],[
-            OPUS_ARM_NEON_INTR=0
-            AC_MSG_RESULT([no])
-         ]
+      OPUS_CHECK_INTRINSICS(
+         [ARM Neon],
+         [$ARM_NEON_INTR_CFLAGS],
+         [OPUS_ARM_MAY_HAVE_NEON_INTR],
+         [OPUS_ARM_PRESUME_NEON_INTR],
+         [[#include <arm_neon.h>
+         ]],
+         [[
+            static float32x4_t A0, A1, SUMM;
+            SUMM = vmlaq_f32(SUMM, A0, A1);
+         ]]
+      )
+      AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"
&& test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
+          [
+             OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS"
+             AC_SUBST([OPUS_ARM_NEON_INTR_CFLAGS])
+          ]
       )
-      CFLAGS="$save_CFLAGS"
-      #Now we know if compiler supports ARM neon intrinsics or not
 
-      #Currently we only have intrinsic optimization for floating point
+      #Currently we only have intrinsic optimizations for floating point
       AS_IF([test x"$enable_float" = x"yes"],
       [
-         AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"],
+         AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" =
x"1"],
          [
-            AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon
Intrinsics])
-            AS_IF([test x"enable_rtcd" != x""],
-               [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[])
-            enable_intrinsics="$enable_intrinsics
ARMv7_Neon_Intrinsics"
-            dnl Don't see why defining these is necessary to check features
at runtime
-            AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support
EDSP Instructions])
-            AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support
MEDIA Instructions])
-            AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support
NEON instructions])
+            AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1,
+                      [Compiler supports ARMv7 Neon Intrinsics])
+            intrinsics_support="$intrinsics_support
(Neon_Intrinsics)"
+
+            AS_IF([test x"enable_rtcd" != x"" &&
test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
+                  [rtcd_support="$rtcd_support
(ARMv7_Neon_Intrinsics)"],[])
+
+            AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" =
x"1"],
+                  [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1,
+                             [Define if binary requires NEON intrinsics
support])])
+
+			   AS_IF([test x"$rtcd_support" = x""],
+                  [rtcd_support=no])
+
+            AS_IF([test x"$intrinsics_support" = x""],
+                  [intrinsics_support=no],
+			         [intrinsics_support="arm$intrinsics_support"])
+
 
             OPUS_PATH_NE10()
             AS_IF([test x"$NE10_LIBS" != "x"],
@@ -472,18 +496,122 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
          ],
          [
             AC_MSG_WARN([Compiler does not support ARM intrinsics])
-            enable_intrinsics=no
+            intrinsics_support=no
          ])
       ], [
-            AC_MSG_WARN([Currently on have ARM intrinsics for float])
-            enable_intrinsics=no
+            AC_MSG_WARN([Currently only have ARM intrinsics for float])
+            intrinsics_support=no
       ])
-   ;;
-   "i386" | "i686" | "x86_64")
-    AS_IF([test x"$enable_float" = x"no"],[
-    AS_IF([test x"$enable_rtcd" = x"yes"],[
+   ],
+   [i?86|x86_64],
+   [
+      OPUS_CHECK_INTRINSICS(
+         [SSE],
+         [$X86_SSE_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE],
+         [OPUS_X86_PRESUME_SSE],
+         [[#include <xmmintrin.h>
+         ]],
+         [[
+             static __m128 mtest;
+             mtest = _mm_setzero_ps();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE" != x"1"],
+          [
+             OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE_CFLAGS])
+          ]
+      )
+      OPUS_CHECK_INTRINSICS(
+         [SSE2],
+         [$X86_SSE2_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE2],
+         [OPUS_X86_PRESUME_SSE2],
+         [[#include <emmintrin.h>
+         ]],
+         [[
+             static __m128i mtest;
+             mtest = _mm_setzero_si128();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE2" != x"1"],
+          [
+             OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE2_CFLAGS])
+          ]
+      )
+      OPUS_CHECK_INTRINSICS(
+         [SSE4.1],
+         [$X86_SSE4_1_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE4_1],
+         [OPUS_X86_PRESUME_SSE4_1],
+         [[#include <smmintrin.h>
+         ]],
+         [[
+            static __m128i mtest;
+            mtest = _mm_setzero_si128();
+            mtest = _mm_cmpeq_epi64(mtest, mtest);
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"],
+          [
+             OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE4_1_CFLAGS])
+          ]
+      )
+
+         AS_IF([test x"$rtcd_support" = x"no"],
[rtcd_support=""])
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86 SSE
Intrinsics])
+            intrinsics_support="$intrinsics_support SSE"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary requires
SSE intrinsics support])],
+               [rtcd_support="$rtcd_support SSE"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE intrinsics])
+         ])
+
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86 SSE2
Intrinsics])
+            intrinsics_support="$intrinsics_support SSE2"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE2" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE2], 1, [Define if binary
requires SSE2 intrinsics support])],
+               [rtcd_support="$rtcd_support SSE2"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE2 intrinsics])
+         ])
+
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], 1, [Compiler supports X86
SSE4.1 Intrinsics])
+            intrinsics_support="$intrinsics_support SSE4.1"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE4_1" =
x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE4_1], 1, [Define if binary
requires SSE4.1 intrinsics support])],
+               [rtcd_support="$rtcd_support SSE4.1"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics])
+         ])
+         AS_IF([test x"$intrinsics_support" = x""],
+            [intrinsics_support=no],
+            [intrinsics_support="x86$intrinsics_support"]
+         )
+         AS_IF([test x"$rtcd_support" = x""],
+            [rtcd_support=no],
+            [rtcd_support="x86$rtcd_support"],
+        )
+
+    AS_IF([test x"$enable_rtcd" = x"yes" && test
x"$rtcd_support" != x""],[
             get_cpuid_by_asm="no"
-            AC_MSG_CHECKING([Get CPU Info])
+            AC_MSG_CHECKING([How to get X86 CPU Info])
             AC_LINK_IFELSE([AC_LANG_PROGRAM([[
                  #include <stdio.h>
             ]],[[
@@ -493,7 +621,7 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
                  unsigned int CPUInfo3;
                  unsigned int InfoType;
                  __asm__ __volatile__ (
-                 "cpuid11":
+                 "cpuid":
                  "=a" (CPUInfo0),
                  "=b" (CPUInfo1),
                  "=c" (CPUInfo2),
@@ -502,7 +630,8 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
                 );
             ]])],
             [get_cpuid_by_asm="yes"
-             AC_MSG_RESULT([Inline Assembly])],
+             AC_MSG_RESULT([Inline Assembly])
+			 AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
              [AC_LINK_IFELSE([AC_LANG_PROGRAM([[
                  #include <cpuid.h>
             ]],[[
@@ -513,90 +642,31 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
                  unsigned int InfoType;
                  __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1,
&CPUInfo2, &CPUInfo3);
             ]])],
-            [AC_MSG_RESULT([C method])],
-            [AC_MSG_ERROR([not support Get CPU Info, please disable intrinsics
])])])
-
-       AC_MSG_CHECKING([sse4.1])
-       TMP_CFLAGS="$CFLAGS"
-       gcc -Q --help=target | grep "\-msse4.1 "
-       AS_IF([test x"$?" = x"0"],[
-            CFLAGS="$CFLAGS -msse4.1"
-            AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find
xmmintrin.h])])
-            AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find
emmintrin.h])])
-            AC_CHECK_HEADER(smmintrin.h, [], [AC_MSG_ERROR([Couldn't find
smmintrin.h])],[
-            #ifdef HAVE_XMMINSTRIN_H
-                 #include <xmmintrin.h>
-                 #endif
-                 #ifdef HAVE_EMMINSTRIN_H
-                 #include <emmintrin.h>
-                 #endif
-            ])
-
-            AC_LINK_IFELSE([AC_LANG_PROGRAM([[
-                 #include <xmmintrin.h>
-                 #include <emmintrin.h>
-                 #include <smmintrin.h>
-            ]],[[
-                 __m128i mtest = _mm_setzero_si128();
-                 mtest = _mm_cmpeq_epi64(mtest, mtest);
-            ]])],
-            [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker
failure for sse4.1, please disable intrinsics])])
-
-            CFLAGS="$TMP_CFLAGS"
-            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], [1], [For x86 sse4.1
instrinsics optimizations])
-            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics
optimizations])
-            rtcd_support="x86 sse4.1"
-            AM_CONDITIONAL([HAVE_SSE4_1], [true])
-            AM_CONDITIONAL([HAVE_SSE2], [true])
-            AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
-            [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])])
-             ],[ ##### Else case for AS_IF([test x"$?" =
x"0"])
-               gcc -Q --help=target | grep "\-msse2 "
-               AC_MSG_CHECKING([sse2])
-               AS_IF([test x"$?" = x"0"],[
-                   AC_MSG_RESULT([yes])
-                   CFLAGS="$CFLAGS -msse2"
-                   AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't
find xmmintrin.h])])
-                   AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't
find emmintrin.h])])
-
-                   AC_LINK_IFELSE([AC_LANG_PROGRAM([[
-                        #include <xmmintrin.h>
-                        #include <emmintrin.h>
-                   ]],[[
-                        __m128i mtest = _mm_setzero_si128();
-                   ]])],
-                   [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker
failure for sse2, please disable intrinsics])])
-
-                  CFLAGS="$TMP_CFLAGS"
-                  AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2
instrinsics optimize])
-                  rtcd_support="x86 sse2"
-                  AM_CONDITIONAL([HAVE_SSE2], [true])
-                  AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
-                  [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c
method])])
-            ],[enable_intrinsics="no"]) #End of AS_IF([test
x"$?" = x"0"]
-        ])
-    ], [
-        enable_intrinsics="no"
-    ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"]
-],
-[  ## Else case for AS_IF([test x"$enable_float" = x"no"]
-   AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for fixed
point])
-   enable_intrinsics="no"
-]) ## End of AS_IF([test x"$enable_float" = x"no"]
-   ;;
-   *)
+            [AC_MSG_RESULT([C method])
+			 AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])],
+            [AC_MSG_ERROR([no supported Get CPU Info method, please disable
intrinsics])])])])
+   ],
+   [
       AC_MSG_WARN([No intrinsics support for your architecture])
-      enable_intrinsics="no"
-   ;;
-   esac
+      intrinsics_support="no"
+   ])
+],
+[
+   intrinsics_support="no"
 ])
 
 AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
 AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
-    [test x"$OPUS_ARM_NEON_INTR" = x"1"])
+    [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
 AM_CONDITIONAL([HAVE_ARM_NE10],
     [test x"$HAVE_ARM_NE10" = x"1"])
 
+AM_CONDITIONAL([HAVE_SSE],
+    [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
+AM_CONDITIONAL([HAVE_SSE2],
+    [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"])
+AM_CONDITIONAL([HAVE_SSE4_1],
+    [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"])
 
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
@@ -704,7 +774,7 @@ AC_MSG_NOTICE([
       Fixed point debugging: ......... ${enable_fixed_point_debug}
       Inline Assembly Optimizations: . ${inline_optimization}
       External Assembly Optimizations: ${asm_optimization}
-      Intrinsics Optimizations.......: ${enable_intrinsics}
+      Intrinsics Optimizations.......: ${intrinsics_support}
       Run-time CPU detection: ........ ${rtcd_support}
       Custom modes: .................. ${enable_custom_modes}
       Assertion checking: ............ ${enable_assertions}
diff --git a/m4/opus-intrinsics.m4 b/m4/opus-intrinsics.m4
new file mode 100644
index 0000000..c74aecd
--- /dev/null
+++ b/m4/opus-intrinsics.m4
@@ -0,0 +1,29 @@
+dnl opus-intrinsics.m4
+dnl macro for testing for support for compiler intrinsics, either by default or
with a compiler flag
+
+dnl OPUS_CHECK_INTRINSICS(NAME-OF-INTRINSICS, COMPILER-FLAG-FOR-INTRINSICS,
VAR-IF-PRESENT, VAR-IF-DEFAULT, TEST-PROGRAM-HEADER, TEST-PROGRAM-BODY)
+AC_DEFUN([OPUS_CHECK_INTRINSICS],
+[
+   AC_MSG_CHECKING([if compiler supports $1 intrinsics])
+   AC_LINK_IFELSE(
+     [AC_LANG_PROGRAM($5, $6)],
+     [
+        $3=1
+ 	$4=1
+        AC_MSG_RESULT([yes])
+      ],[
+        $4=0
+        AC_MSG_RESULT([no])
+        AC_MSG_CHECKING([if compiler supports $1 intrinsics with $2])
+        save_CFLAGS="$CFLAGS"; CFLAGS="$2 $CFLAGS"
+        AC_LINK_IFELSE([AC_LANG_PROGRAM($5, $6)],
+        [
+           AC_MSG_RESULT([yes])
+           $3=1
+        ],[
+           AC_MSG_RESULT([no])
+           $3=0
+        ])
+        CFLAGS="$save_CFLAGS"
+     ])
+])
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 9a0e096..61efa8d 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1(
     int                         arch                /* I    Run-time
architecture                                       */
 );
 
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30,
subfr_length, nb_subfr, D, arch) \
+    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x,
minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#else
+
 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     opus_int32                  *res_nrg,           /* O    Residual energy    
*/
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q
value                                     */
@@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK +
1])(
 #  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30,
subfr_length, nb_subfr, D, arch) \
     ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q,
A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
+#endif
+
 opus_int64 silk_inner_prod16_aligned_64_sse4_1(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 );
 
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+
+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+
+#else
+
 extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK +
1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
@@ -75,3 +91,4 @@ extern opus_int64 (*const
SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
 
 #endif
 #endif
+#endif
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index f970632..afd5ec2 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1(
     opus_int                    L                               /* I    number
of vectors in codebook               */
 );
 
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7,
cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14,
W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L))
+
+#else
+
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index
of best codebook vector               */
     opus_int32                  *rate_dist_Q14,                 /* O    best
weighted quant error + mu * rate       */
@@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14,
gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
                           mu_Q9, max_gain_Q7, L))
 
+#endif
+
 #  define OVERRIDE_silk_NSQ
 
 void silk_NSQ_sse4_1(
@@ -89,6 +100,15 @@ void silk_NSQ_sse4_1(
     const opus_int              LTP_scale_Q14                               /*
I    LTP state scaling               */
 );
 
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12,
LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL,
Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses,
PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL,
Lambda_Q10, LTP_scale_Q14))
+
+#else
+
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                    /*
I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /*
I/O  NSQ state                       */
@@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3,
pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL,
Lambda_Q10, LTP_scale_Q14))
 
+#endif
+
 #  define OVERRIDE_silk_NSQ_del_dec
 
 void silk_NSQ_del_dec_sse4_1(
@@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1(
     const opus_int              LTP_scale_Q14                               /*
I    LTP state scaling               */
 );
 
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12,
LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses,
PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#else
+
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state    *psEncC,                                    /*
I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /*
I/O  NSQ state                       */
@@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK +
1])(
     ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ,
psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14))
 
+#endif
+
 void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state              
*/
     opus_int            signalType,             /* I    Signal type            
*/
@@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(
     const opus_int16   pIn[]
 );
 
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_VAD_GetSA_Q8(psEnC, pIn, arch)
((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+
+#else
+
 #  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
 
@@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK
+ 1])(
 
 #  define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
 
+#endif
+
 void silk_warped_LPC_analysis_filter_FIX_sse4_1(
           opus_int32            state[],                    /* I/O  State
[order + 1]                   */
           opus_int32            res_Q2[],                   /* O    Residual
signal [length]            */
@@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1(
     const opus_int              order                       /* I    Filter
order (even)                 */
 );
 
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input,
lambda_Q16, length, order, arch) \
+    ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2,
coef_Q13, input, lambda_Q16, length, order))
+
+#else
+
 extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK +
1])(
           opus_int32            state[],                    /* I/O  State
[order + 1]                   */
           opus_int32            res_Q2[],                   /* O    Residual
signal [length]            */
@@ -224,5 +270,7 @@ extern void (*const
SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])
 #  define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input,
lambda_Q16, length, order, arch) \
     ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) &
OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
 
+#endif
+
 # endif
 #endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 6747d10..ad9fef2 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -35,6 +35,10 @@
 #include "pitch.h"
 #include "main.h"
 
+#if !defined(OPUS_X86_PRESUME_SSE4_1)
+
+#if defined(FIXED_POINT)
+
 opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
@@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[
OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_inner_prod16_aligned_64_c,                  /* non-sse */
   silk_inner_prod16_aligned_64_c,
+  silk_inner_prod16_aligned_64_c,
   MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
-  NULL
 };
 
+#endif
+
 opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
     silk_encoder_state *psEncC,
     const opus_int16   pIn[]
 ) = {
   silk_VAD_GetSA_Q8_c,                  /* non-sse */
   silk_VAD_GetSA_Q8_c,
+  silk_VAD_GetSA_Q8_c,
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
-  NULL
 };
 
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_NSQ_c,                  /* non-sse */
   silk_NSQ_c,
+  silk_NSQ_c,
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
-  NULL
 };
 
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_VQ_WMat_EC_c,                  /* non-sse */
   silk_VQ_WMat_EC_c,
+  silk_VQ_WMat_EC_c,
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
-  NULL
 };
 
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_NSQ_del_dec_c,                  /* non-sse */
   silk_NSQ_del_dec_c,
+  silk_NSQ_del_dec_c,
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
-  NULL
 };
 
+#if defined(FIXED_POINT)
+
 void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(
     opus_int32                  state[],                    /* I/O  State
[order + 1]                   */
     opus_int32                  res_Q2[],                   /* O    Residual
signal [length]            */
@@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[
OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_warped_LPC_analysis_filter_FIX_c,                  /* non-sse */
   silk_warped_LPC_analysis_filter_FIX_c,
+  silk_warped_LPC_analysis_filter_FIX_c,
   MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
-  NULL
 };
 
 void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 ) = {
   silk_burg_modified_c,                  /* non-sse */
   silk_burg_modified_c,
+  silk_burg_modified_c,
   MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
-  NULL
 };
+
+#endif
+#endif
diff --git a/win32/VS2010/celt.vcxproj b/win32/VS2010/celt.vcxproj
index f107fec..e068fbe 100644
--- a/win32/VS2010/celt.vcxproj
+++ b/win32/VS2010/celt.vcxproj
@@ -37,6 +37,12 @@
     <ClCompile Include="..\..\celt\quant_bands.c" />
     <ClCompile Include="..\..\celt\rate.c" />
     <ClCompile Include="..\..\celt\vq.c" />
+    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse2.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c" />
+    <ClCompile Include="..\..\celt\x86\x86cpu.c" />
+    <ClCompile Include="..\..\celt\x86\x86_celt_map.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\celt\arch.h" />
@@ -67,6 +73,9 @@
     <ClInclude Include="..\..\celt\static_modes_fixed.h" />
     <ClInclude Include="..\..\celt\static_modes_float.h" />
     <ClInclude Include="..\..\celt\vq.h" />
+    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h" />
+    <ClInclude Include="..\..\celt\x86\pitch_sse.h" />
+    <ClInclude Include="..\..\celt\x86\x86cpu.h" />
     <ClInclude Include="..\..\celt\_kiss_fft_guts.h" />
   </ItemGroup>
   <PropertyGroup Label="Globals">
@@ -141,7 +150,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -168,7 +177,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -196,7 +205,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -227,7 +236,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
diff --git a/win32/VS2010/celt.vcxproj.filters
b/win32/VS2010/celt.vcxproj.filters
index e3a1d97..e9948fa 100644
--- a/win32/VS2010/celt.vcxproj.filters
+++ b/win32/VS2010/celt.vcxproj.filters
@@ -69,6 +69,24 @@
     <ClCompile Include="..\..\celt\celt.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse2.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\x86_celt_map.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\x86cpu.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\celt\cwrs.h">
@@ -158,5 +176,14 @@
     <ClInclude Include="..\..\celt\celt_lpc.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\celt\x86\pitch_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\celt\x86\x86cpu.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/win32/VS2010/silk_common.vcxproj b/win32/VS2010/silk_common.vcxproj
index 9cf5f48..d3d077d 100644
--- a/win32/VS2010/silk_common.vcxproj
+++ b/win32/VS2010/silk_common.vcxproj
@@ -88,7 +88,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -118,7 +118,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -149,7 +149,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
@@ -184,7 +184,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
@@ -212,6 +212,8 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="..\..\include\opus_types.h" />
+    <ClInclude Include="..\..\silk\x86\main_sse.h" />
+    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h" />
     <ClInclude Include="..\..\win32\config.h" />
     <ClInclude Include="..\..\silk\control.h" />
     <ClInclude Include="..\..\silk\debug.h" />
@@ -311,8 +313,13 @@
     <ClCompile Include="..\..\silk\table_LSF_cos.c" />
     <ClCompile Include="..\..\silk\VAD.c" />
     <ClCompile Include="..\..\silk\VQ_WMat_EC.c" />
+    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c" />
+    <ClCompile Include="..\..\silk\x86\NSQ_sse.c" />
+    <ClCompile Include="..\..\silk\x86\VAD_sse.c" />
+    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c" />
+    <ClCompile Include="..\..\silk\x86\x86_silk_map.c" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/win32/VS2010/silk_common.vcxproj.filters
b/win32/VS2010/silk_common.vcxproj.filters
index 30db48e..341180b 100644
--- a/win32/VS2010/silk_common.vcxproj.filters
+++ b/win32/VS2010/silk_common.vcxproj.filters
@@ -81,6 +81,12 @@
     <ClInclude Include="..\..\silk\typedef.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\silk\x86\main_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\silk\VQ_WMat_EC.c">
@@ -311,5 +317,20 @@
     <ClCompile Include="..\..\silk\VAD.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\NSQ_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\VAD_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\x86_silk_map.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/win32/VS2010/silk_fixed.vcxproj b/win32/VS2010/silk_fixed.vcxproj
index 5ea1a91..522101e 100644
--- a/win32/VS2010/silk_fixed.vcxproj
+++ b/win32/VS2010/silk_fixed.vcxproj
@@ -86,7 +86,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -104,7 +104,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -123,7 +123,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -145,7 +145,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
      
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -191,8 +191,11 @@
     <ClCompile Include="..\..\silk\fixed\solve_LS_FIX.c" />
     <ClCompile Include="..\..\silk\fixed\vector_ops_FIX.c" />
     <ClCompile
Include="..\..\silk\fixed\warped_autocorrelation_FIX.c" />
+    <ClCompile
Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c" />
+    <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c"
/>
+    <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c"
/>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/win32/VS2010/silk_fixed.vcxproj.filters
b/win32/VS2010/silk_fixed.vcxproj.filters
index 6897930..c2327eb 100644
--- a/win32/VS2010/silk_fixed.vcxproj.filters
+++ b/win32/VS2010/silk_fixed.vcxproj.filters
@@ -18,16 +18,16 @@
     <ClInclude Include="..\..\win32\config.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="main_FIX.h">
+    <ClInclude Include="..\..\include\opus_types.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\SigProc_FIX.h">
+    <ClInclude Include="..\..\silk\SigProc_FIX.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="structs_FIX.h">
+    <ClInclude Include="..\..\silk\fixed\main_FIX.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\include\opus_types.h">
+    <ClInclude Include="..\..\silk\fixed\structs_FIX.h">
       <Filter>Header Files</Filter>
     </ClInclude>
   </ItemGroup>
@@ -107,5 +107,14 @@
     <ClCompile
Include="..\..\silk\fixed\LTP_analysis_filter_FIX.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile
Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile
Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile
Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/win32/config.h b/win32/config.h
index 46ff699..10fbf33 100644
--- a/win32/config.h
+++ b/win32/config.h
@@ -35,9 +35,28 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #define OPUS_BUILD            1
 
-/* Enable SSE functions, if compiled with SSE/SSE2 (note that AMD64 implies
SSE2) */
-#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
-#define __SSE__               1
+#if defined(_M_IX86) || defined(_M_X64)
+/* Can always build with SSE intrinsics (no special compiler flags necessary)
*/
+#define OPUS_X86_MAY_HAVE_SSE
+#define OPUS_X86_MAY_HAVE_SSE2
+#define OPUS_X86_MAY_HAVE_SSE4_1
+
+/* Presume SSE functions, if compiled with SSE/SSE2/AVX (note that AMD64
implies SSE2, and AVX
+   implies SSE4.1) */
+#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) ||
defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE 1
+#endif
+#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) ||
defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE2 1
+#endif
+#if defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE4_1 1
+#endif
+
+#if !defined(OPUS_X86_PRESUME_SSE4_1) || !defined(OPUS_X86_PRESUME_SSE2) ||
!defined(OPUS_X86_PRESUME_SSE)
+#define OPUS_HAVE_RTCD 1
+#endif
+
 #endif
 
 #include "version.h"
-- 
1.9.1

Viswanath Puttagunta

2015-Mar-12 18:06 UTC

head link

[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.

Hi All,

I have rebased Jonathan?s patch [1] (which I will address as RFCv1) on
top of my ARM neon intrinsics/NE10 work. [2], [3]

I have reviewed the patch, rebased it and make sure it compiles.

Compile configure options checked
  ../opus/configure
  ../opus/configure ?enable-intrinscs
  ../opus/configure ?host=arm-linux-gnueabihf
  ../opus/configure ?host=arm-linux-gnueabihf ?enable-intrinsics
  ../opus/configure ?host=aarch64-gnu-linux
  ../opus/configure ?host=aarch64-gnu-linux

I verified it works on x86 (SSE4.1). (Encode/Decode celt basic tests)
I will verify on ARM(64) hardware later on. (on road now)
I will base my future work on this patch.

This patch, in addition to ARM NE10 patches posted [2], [3]
are available at [4] for any reference.

Note that I don?t have experience with x86 intrinsics/instructions.
So, I reviewed it from generic code fixes and any impact on ARM
perspective.

Please provide feedback/review so this work can be merged. and we don?t
have to keep rebasing our future work.


[1]: http://lists.xiph.org/pipermail/opus/2015-March/002899.html
[2]: http://lists.xiph.org/pipermail/opus/2015-March/002905.html
[3]: http://lists.xiph.org/pipermail/opus/2015-March/002919.html
[4]:  https://git.linaro.org/people/viswanath.puttagunta/opus.git
       Branch: opus-dev-pending


Regards,
Vish


On 12 March 2015 at 12:45, Viswanath Puttagunta
<viswanath.puttagunta at linaro.org> wrote:> From: Jonathan Lennox <jonathan at vidyo.com>
>
> * Makes ?enable-intrinsics work with clang and other non-GCC compilers
> * Enables RTCD for the floating-point-mode SSE code in Celt.
> * Disables use of RTCD in cases where the compiler targets an instruction
set by default.
> * Enables the SSE4.1 Silk optimizations that apply to the common parts of
Silk when Opus is built in floating-point mode, not just in fixed-point mode.
> * Enables the SSE intrinsics (with RTCD when appropriate) in the Win32
build.
> * Fixes a case where GCC would compile SSE2 code as SSE4.1, causing a crash
on non-SSE4.1 CPUs.
> * Allows configuration with compilers with non-GCC-flavor flags for
enabling architecture options.
> * Hopefully makes the configuration and ifdef?s easier to follow and
understand.
>
> Reviewed-by: Viswanath Puttagunta <viswanath.puttagunta at
linaro.org>
> ---
>  Makefile.am                              |  38 ++--
>  celt/arm/armcpu.c                        |   6 +-
>  celt/arm/pitch_arm.h                     |   4 +-
>  celt/bands.c                             |   6 +-
>  celt/celt.c                              |  16 +-
>  celt/celt.h                              |  12 +-
>  celt/celt_decoder.c                      |   6 +-
>  celt/celt_encoder.c                      |   4 +-
>  celt/celt_lpc.h                          |   2 +-
>  celt/cpu_support.h                       |  15 +-
>  celt/mips/celt_mipsr1.h                  |   2 +-
>  celt/pitch.c                             |   4 +-
>  celt/pitch.h                             |  19 +-
>  celt/tests/test_unit_dft.c               |   4 +-
>  celt/tests/test_unit_mathops.c           |  11 +-
>  celt/tests/test_unit_mdct.c              |   4 +-
>  celt/tests/test_unit_rotation.c          |  11 +-
>  celt/x86/celt_lpc_sse.c                  |   4 +
>  celt/x86/celt_lpc_sse.h                  |  12 +-
>  celt/x86/pitch_sse.c                     | 334
+++++++++++++------------------
>  celt/x86/pitch_sse.h                     | 256 ++++++++++-------------
>  celt/x86/pitch_sse2.c                    |  95 +++++++++
>  celt/x86/pitch_sse4_1.c                  | 195 ++++++++++++++++++
>  celt/x86/x86_celt_map.c                  |  76 ++++++-
>  celt/x86/x86cpu.c                        |  47 ++++-
>  celt/x86/x86cpu.h                        |  26 ++-
>  celt_sources.mk                          |   5 +-
>  configure.ac                             | 312
++++++++++++++++++-----------
>  m4/opus-intrinsics.m4                    |  29 +++
>  silk/x86/SigProc_FIX_sse.h               |  17 ++
>  silk/x86/main_sse.h                      |  48 +++++
>  silk/x86/x86_silk_map.c                  |  25 ++-
>  win32/VS2010/celt.vcxproj                |  17 +-
>  win32/VS2010/celt.vcxproj.filters        |  27 +++
>  win32/VS2010/silk_common.vcxproj         |  17 +-
>  win32/VS2010/silk_common.vcxproj.filters |  23 ++-
>  win32/VS2010/silk_fixed.vcxproj          |  13 +-
>  win32/VS2010/silk_fixed.vcxproj.filters  |  17 +-
>  win32/config.h                           |  25 ++-
>  39 files changed, 1210 insertions(+), 574 deletions(-)
>  create mode 100644 celt/x86/pitch_sse2.c
>  create mode 100644 celt/x86/pitch_sse4_1.c
>  create mode 100644 m4/opus-intrinsics.m4
>
> diff --git a/Makefile.am b/Makefile.am
> index c5c1562..3a75740 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -23,6 +23,9 @@ SILK_SOURCES += $(SILK_SOURCES_SSE4_1)
$(SILK_SOURCES_FIXED_SSE4_1)
>  endif
>  else
>  SILK_SOURCES += $(SILK_SOURCES_FLOAT)
> +if HAVE_SSE4_1
> +SILK_SOURCES += $(SILK_SOURCES_SSE4_1)
> +endif
>  endif
>
>  if DISABLE_FLOAT_API
> @@ -30,12 +33,14 @@ else
>  OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
>  endif
>
> -if HAVE_SSE4_1
> -CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1)
> -else
> -if HAVE_SSE2
> +if HAVE_SSE
>  CELT_SOURCES += $(CELT_SOURCES_SSE)
>  endif
> +if HAVE_SSE2
> +CELT_SOURCES += $(CELT_SOURCES_SSE2)
> +endif
> +if HAVE_SSE4_1
> +CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
>  endif
>
>  if CPU_ARM
> @@ -44,7 +49,6 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
>
>  if OPUS_ARM_NEON_INTR
>  CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
> -OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon
>  endif
>
>  if HAVE_ARM_NE10
> @@ -262,20 +266,30 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S):
$(top_srcdir)/celt/arm/arm2gnu.pl
>  %-gnu.S: %.s
>         $(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< >
$@
>
> -SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o
> +OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
> +                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o)
> +
> +if HAVE_SSE
> +SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
> +$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
> +endif
>
> -if HAVE_SSE4_1
> -$(SSE_OBJ): CFLAGS += -msse4.1
> -else
>  if HAVE_SSE2
> -$(SSE_OBJ): CFLAGS += -msse2
> +SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
> +$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
>  endif
> +
> +if HAVE_SSE4_1
> +SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
> +             $(SILK_SOURCES_SSE4_1:.c=.lo) \
> +             $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
> +$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
>  endif
>
>  if OPUS_ARM_NEON_INTR
>  CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
>                           $(CELT_SOURCES_ARM_NE10:.c=.lo) \
> -                         %test_unit_rotation.o %test_unit_mathops.o \
>                           %test_unit_mdct.o %test_unit_dft.o
> -$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS)
$(NE10_CFLAGS)
> +
> +$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS +=
$(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS)
>  endif
> diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
> index 1768525..5e5d10c 100644
> --- a/celt/arm/armcpu.c
> +++ b/celt/arm/armcpu.c
> @@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32
opus_cpu_capabilities(void){
>    __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
>      /*Ignore exception.*/
>    }
> -#   if defined(OPUS_ARM_MAY_HAVE_NEON)
> +#   if defined(OPUS_ARM_MAY_HAVE_NEON) ||
defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
>    __try{
>      /*VORR q0,q0,q0*/
>      __emit(0xF2200150);
> @@ -107,7 +107,7 @@ opus_uint32 opus_cpu_capabilities(void)
>
>      while(fgets(buf, 512, cpuinfo) != NULL)
>      {
> -# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
> +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) ||
defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
>        /* Search for edsp and neon flag */
>        if(memcmp(buf, "Features", 8) == 0)
>        {
> @@ -118,7 +118,7 @@ opus_uint32 opus_cpu_capabilities(void)
>            flags |= OPUS_CPU_ARM_EDSP;
>  #  endif
>
> -#  if defined(OPUS_ARM_MAY_HAVE_NEON)
> +#  if defined(OPUS_ARM_MAY_HAVE_NEON) ||
defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
>          p = strstr(buf, " neon");
>          if(p != NULL && (p[5] == ' ' || p[5] ==
'\n'))
>            flags |= OPUS_CPU_ARM_NEON;
> diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
> index 125d1bc..8626ed7 100644
> --- a/celt/arm/pitch_arm.h
> +++ b/celt/arm/pitch_arm.h
> @@ -54,10 +54,10 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x,
const opus_val16 *_y,
>
>  #else /* Start !FIXED_POINT */
>  /* Float case */
> -#if defined(OPUS_ARM_NEON_INTR)
> +#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
>  void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16
*_y,
>                                   opus_val32 *xcorr, int len, int
max_pitch);
> -#if !defined(OPUS_HAVE_RTCD)
> +#if !defined(OPUS_HAVE_RTCD) || defined(OPUS_ARM_PRESUME_NEON_INTR)
>  #define OVERRIDE_PITCH_XCORR (1)
>  #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
>     ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len,
max_pitch))
> diff --git a/celt/bands.c b/celt/bands.c
> index c643b09..25f229e 100644
> --- a/celt/bands.c
> +++ b/celt/bands.c
> @@ -398,7 +398,7 @@ static void stereo_split(celt_norm * OPUS_RESTRICT X,
celt_norm * OPUS_RESTRICT
>     }
>  }
>
> -static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm *
OPUS_RESTRICT Y, opus_val16 mid, int N)
> +static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm *
OPUS_RESTRICT Y, opus_val16 mid, int N, int arch)
>  {
>     int j;
>     opus_val32 xp=0, side=0;
> @@ -410,7 +410,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X,
celt_norm * OPUS_RESTRICT
>     opus_val32 t, lgain, rgain;
>
>     /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
> -   dual_inner_prod(Y, X, Y, N, &xp, &side);
> +   dual_inner_prod(Y, X, Y, N, &xp, &side, arch);
>     /* Compensating for the mid normalization */
>     xp = MULT16_32_Q15(mid, xp);
>     /* mid and side are in Q15, not Q14 like X and Y */
> @@ -1348,7 +1348,7 @@ static unsigned quant_band_stereo(struct band_ctx
*ctx, celt_norm *X, celt_norm
>     if (resynth)
>     {
>        if (N!=2)
> -         stereo_merge(X, Y, mid, N);
> +         stereo_merge(X, Y, mid, N, ctx->arch);
>        if (inv)
>        {
>           int j;
> diff --git a/celt/celt.c b/celt/celt.c
> index a610de4..40c62ce 100644
> --- a/celt/celt.c
> +++ b/celt/celt.c
> @@ -89,10 +89,12 @@ int resampling_factor(opus_int32 rate)
>     return ret;
>  }
>
> -#ifndef OVERRIDE_COMB_FILTER_CONST
>  /* This version should be faster on ARM */
>  #ifdef OPUS_ARM_ASM
> -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
> +#ifndef NON_STATIC_COMB_FILTER_CONST_C
> +static
> +#endif
> +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
>        opus_val16 g10, opus_val16 g11, opus_val16 g12)
>  {
>     opus_val32 x0, x1, x2, x3, x4;
> @@ -147,7 +149,10 @@ static void comb_filter_const(opus_val32 *y,
opus_val32 *x, int T, int N,
>  #endif
>  }
>  #else
> -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
> +#ifndef NON_STATIC_COMB_FILTER_CONST_C
> +static
> +#endif
> +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
>        opus_val16 g10, opus_val16 g11, opus_val16 g12)
>  {
>     opus_val32 x0, x1, x2, x3, x4;
> @@ -171,12 +176,11 @@ static void comb_filter_const(opus_val32 *y,
opus_val32 *x, int T, int N,
>
>  }
>  #endif
> -#endif
>
>  #ifndef OVERRIDE_comb_filter
>  void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
>        opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
> -      const opus_val16 *window, int overlap)
> +      const opus_val16 *window, int overlap, int arch)
>  {
>     int i;
>     /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
> @@ -234,7 +238,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0,
int T1, int N,
>     }
>
>     /* Compute the part with the constant filter. */
> -   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
> +   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);
>  }
>  #endif /* OVERRIDE_comb_filter */
>
> diff --git a/celt/celt.h b/celt/celt.h
> index b196751..a423b95 100644
> --- a/celt/celt.h
> +++ b/celt/celt.h
> @@ -201,7 +201,17 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT
pcmp, celt_sig * OPUS_RES
>
>  void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
>        opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
> -      const opus_val16 *window, int overlap);
> +      const opus_val16 *window, int overlap, int arch);
> +
> +#ifdef NON_STATIC_COMB_FILTER_CONST_C
> +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
> +                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
> +#endif
> +
> +#ifndef OVERRIDE_COMB_FILTER_CONST
> +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch)            \
> +    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
> +#endif
>
>  void init_caps(const CELTMode *m,int *cap,int LM,int C);
>
> diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
> index 304f334..505a6ef 100644
> --- a/celt/celt_decoder.c
> +++ b/celt/celt_decoder.c
> @@ -699,7 +699,7 @@ static void celt_decode_lost(CELTDecoder *
OPUS_RESTRICT st, int N, int LM)
>           comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
>                st->postfilter_period, st->postfilter_period, overlap,
>                -st->postfilter_gain, -st->postfilter_gain,
> -              st->postfilter_tapset, st->postfilter_tapset, NULL,
0);
> +              st->postfilter_tapset, st->postfilter_tapset, NULL, 0,
st->arch);
>
>           /* Simulate TDAC on the concealed audio so that it blends with
the
>              MDCT of the next frame. */
> @@ -1011,11 +1011,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT
st, const unsigned char *dat
>        st->postfilter_period_old=IMAX(st->postfilter_period_old,
COMBFILTER_MINPERIOD);
>        comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old,
st->postfilter_period, mode->shortMdctSize,
>              st->postfilter_gain_old, st->postfilter_gain,
st->postfilter_tapset_old, st->postfilter_tapset,
> -            mode->window, overlap);
> +            mode->window, overlap, st->arch);
>        if (LM!=0)
>           comb_filter(out_syn[c]+mode->shortMdctSize,
out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch,
N-mode->shortMdctSize,
>                 st->postfilter_gain, postfilter_gain,
st->postfilter_tapset, postfilter_tapset,
> -               mode->window, overlap);
> +               mode->window, overlap, st->arch);
>
>     } while (++c<CC);
>     st->postfilter_period_old = st->postfilter_period;
> diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
> index 5f48638..1c9dbcb 100644
> --- a/celt/celt_encoder.c
> +++ b/celt/celt_encoder.c
> @@ -1166,11 +1166,11 @@ static int run_prefilter(CELTEncoder *st, celt_sig
*in, celt_sig *prefilter_mem,
>        if (offset)
>           comb_filter(in+c*(N+overlap)+overlap,
pre[c]+COMBFILTER_MAXPERIOD,
>                 st->prefilter_period, st->prefilter_period, offset,
-st->prefilter_gain, -st->prefilter_gain,
> -               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
> +               st->prefilter_tapset, st->prefilter_tapset, NULL, 0,
st->arch);
>
>        comb_filter(in+c*(N+overlap)+overlap+offset,
pre[c]+COMBFILTER_MAXPERIOD+offset,
>              st->prefilter_period, pitch_index, N-offset,
-st->prefilter_gain, -gain1,
> -            st->prefilter_tapset, prefilter_tapset, mode->window,
overlap);
> +            st->prefilter_tapset, prefilter_tapset, mode->window,
overlap, st->arch);
>        OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap);
>
>        if (N>COMBFILTER_MAXPERIOD)
> diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
> index dc8967f..323459e 100644
> --- a/celt/celt_lpc.h
> +++ b/celt/celt_lpc.h
> @@ -48,7 +48,7 @@ void celt_fir_c(
>           opus_val16 *mem,
>           int arch);
>
> -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#if !defined(OVERRIDE_CELT_FIR)
>  #define celt_fir(x, num, y, N, ord, mem, arch) \
>      (celt_fir_c(x, num, y, N, ord, mem, arch))
>  #endif
> diff --git a/celt/cpu_support.h b/celt/cpu_support.h
> index 1d62e2f..5e99a90 100644
> --- a/celt/cpu_support.h
> +++ b/celt/cpu_support.h
> @@ -32,7 +32,8 @@
>  #include "opus_defines.h"
>
>  #if defined(OPUS_HAVE_RTCD) && \
> -  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
> +
>  #include "arm/armcpu.h"
>
>  /* We currently support 4 ARM variants:
> @@ -43,14 +44,16 @@
>   */
>  #define OPUS_ARCHMASK 3
>
> -#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#elif (defined(OPUS_X86_MAY_HAVE_SSE) &&
!defined(OPUS_X86_PRESUME_SSE)) || \
> +  (defined(OPUS_X86_MAY_HAVE_SSE2) &&
!defined(OPUS_X86_PRESUME_SSE2)) || \
> +  (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1))
>
>  #include "x86/x86cpu.h"
> -/* We currently support 3 x86 variants:
> +/* We currently support 4 x86 variants:
>   * arch[0] -> non-sse
> - * arch[1] -> sse2
> - * arch[2] -> sse4.1
> - * arch[3] -> NULL
> + * arch[1] -> sse
> + * arch[2] -> sse2
> + * arch[3] -> sse4.1
>   */
>  #define OPUS_ARCHMASK 3
>  int opus_select_arch(void);
> diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h
> index 03915d8..7915d59 100644
> --- a/celt/mips/celt_mipsr1.h
> +++ b/celt/mips/celt_mipsr1.h
> @@ -56,7 +56,7 @@
>  #define OVERRIDE_comb_filter
>  void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
>        opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
> -      const opus_val16 *window, int overlap)
> +      const opus_val16 *window, int overlap, int arch)
>  {
>     int i;
>     opus_val32 x0, x1, x2, x3, x4;
> diff --git a/celt/pitch.c b/celt/pitch.c
> index 4364703..1d89cb0 100644
> --- a/celt/pitch.c
> +++ b/celt/pitch.c
> @@ -439,7 +439,7 @@ opus_val16 remove_doubling(opus_val16 *x, int
maxperiod, int minperiod,
>
>     T = T0 = *T0_;
>     ALLOC(yy_lookup, maxperiod+1, opus_val32);
> -   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
> +   dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);
>     yy_lookup[0] = xx;
>     yy=xx;
>     for (i=1;i<=maxperiod;i++)
> @@ -483,7 +483,7 @@ opus_val16 remove_doubling(opus_val16 *x, int
maxperiod, int minperiod,
>        {
>           T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
>        }
> -      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
> +      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2,
arch);
>        xy += xy2;
>        yy = yy_lookup[T1] + yy_lookup[T1b];
>  #ifdef FIXED_POINT
> diff --git a/celt/pitch.h b/celt/pitch.h
> index 4368cc5..af745eb 100644
> --- a/celt/pitch.h
> +++ b/celt/pitch.h
> @@ -37,8 +37,8 @@
>  #include "modes.h"
>  #include "cpu_support.h"
>
> -#if defined(__SSE__) && !defined(FIXED_POINT) \
> - || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
> +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \
> +  || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) ||
defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT))
>  #include "x86/pitch_sse.h"
>  #endif
>
> @@ -135,8 +135,7 @@ static OPUS_INLINE void xcorr_kernel_c(const opus_val16
* x, const opus_val16 *
>  #endif /* OVERRIDE_XCORR_KERNEL */
>
>
> -#ifndef OVERRIDE_DUAL_INNER_PROD
> -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const
opus_val16 *y01, const opus_val16 *y02,
> +static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const
opus_val16 *y01, const opus_val16 *y02,
>        int N, opus_val32 *xy1, opus_val32 *xy2)
>  {
>     int i;
> @@ -150,6 +149,10 @@ static OPUS_INLINE void dual_inner_prod(const
opus_val16 *x, const opus_val16 *y
>     *xy1 = xy01;
>     *xy2 = xy02;
>  }
> +
> +#ifndef OVERRIDE_DUAL_INNER_PROD
> +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
> +    ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))
>  #endif
>
>  /*We make sure a C version is always available for cases where the
overhead of
> @@ -169,6 +172,12 @@ static OPUS_INLINE opus_val32 celt_inner_prod_c(const
opus_val16 *x,
>      ((void)(arch),celt_inner_prod_c(x, y, N))
>  #endif
>
> +#ifdef NON_STATIC_COMB_FILTER_CONST_C
> +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
> +     opus_val16 g10, opus_val16 g11, opus_val16 g12);
> +#endif
> +
> +
>  #ifdef FIXED_POINT
>  opus_val32
>  #else
> @@ -180,7 +189,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const
opus_val16 *_y,
>  #if !defined(OVERRIDE_PITCH_XCORR)
>  /*Is run-time CPU detection enabled on this platform?*/
>  # if defined(OPUS_HAVE_RTCD) && \
> -  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +  (defined(OPUS_ARM_ASM) || (defined(OPUS_ARM_NEON_INTR) &&
!defined(OPUS_ARM_PRESUME_NEON_INTR)))
>  extern
>  #  if defined(FIXED_POINT)
>  opus_val32
> diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
> index 84f69bd..57691c6 100644
> --- a/celt/tests/test_unit_dft.c
> +++ b/celt/tests/test_unit_dft.c
> @@ -50,7 +50,7 @@
>  #include "entcode.c"
>
>  #if defined(OPUS_HAVE_RTCD) && \
> -         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
>  #include "arm/armcpu.c"
>  #if defined(HAVE_ARM_NE10)
>  #include "arm/celt_ne10_fft.c"
> @@ -60,6 +60,8 @@
>  #include "arm/arm_celt_map.c"
>  #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
>  #include "x86/x86cpu.c"
> +#include "celt/x86/pitch_sse.c"
> +#include "x86/x86_celt_map.c"
>  #endif
>
>  #ifndef M_PI
> diff --git a/celt/tests/test_unit_mathops.c
b/celt/tests/test_unit_mathops.c
> index 0f1e4f1..379fbd5 100644
> --- a/celt/tests/test_unit_mathops.c
> +++ b/celt/tests/test_unit_mathops.c
> @@ -49,12 +49,21 @@
>  #include "cwrs.c"
>  #include "pitch.c"
>  #include "celt_lpc.c"
> +#include "celt.c"
>  #include "kiss_fft.c"
>  #include "mdct.c"
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
> +#if defined(OPUS_X86_MAY_HAVE_SSE) || \
> +    defined(OPUS_X86_MAY_HAVE_SSE2) || \
> +    defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#if defined(OPUS_X86_MAY_HAVE_SSE)
>  #include "x86/pitch_sse.c"
> +#endif
> +#if defined(OPUS_X86_MAY_HAVE_SSE2)
> +#include "x86/pitch_sse2.c"
> +#endif
>  #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#include "x86/pitch_sse4_1.c"
>  #include "x86/celt_lpc_sse.c"
>  #endif
>  #include "x86/x86_celt_map.c"
> diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
> index c64cac2..d8c4ef0 100644
> --- a/celt/tests/test_unit_mdct.c
> +++ b/celt/tests/test_unit_mdct.c
> @@ -49,7 +49,7 @@
>  #include "entcode.c"
>
>  #if defined(OPUS_HAVE_RTCD) && \
> -         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
>  #include "arm/armcpu.c"
>  #if defined(HAVE_ARM_NE10)
>  #include "arm/celt_ne10_fft.c"
> @@ -60,6 +60,8 @@
>
>  #elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
>  #include "x86/x86cpu.c"
> +#include "celt/x86/pitch_sse.c"
> +#include "x86/x86_celt_map.c"
>  #endif
>
>  #ifndef M_PI
> diff --git a/celt/tests/test_unit_rotation.c
b/celt/tests/test_unit_rotation.c
> index ce14936..3cf54fa 100644
> --- a/celt/tests/test_unit_rotation.c
> +++ b/celt/tests/test_unit_rotation.c
> @@ -46,13 +46,22 @@
>  #include "bands.h"
>  #include "pitch.c"
>  #include "celt_lpc.c"
> +#include "celt.c"
>  #include "kiss_fft.c"
>  #include "mdct.c"
>  #include <math.h>
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
> +#if defined(OPUS_X86_MAY_HAVE_SSE) || \
> +    defined(OPUS_X86_MAY_HAVE_SSE2) || \
> +    defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#if defined(OPUS_X86_MAY_HAVE_SSE)
>  #include "x86/pitch_sse.c"
> +#endif
> +#if defined(OPUS_X86_MAY_HAVE_SSE2)
> +#include "x86/pitch_sse2.c"
> +#endif
>  #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#include "x86/pitch_sse4_1.c"
>  #include "x86/celt_lpc_sse.c"
>  #endif
>  #include "x86/x86_celt_map.c"
> diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
> index 9fb9779..67e5592 100644
> --- a/celt/x86/celt_lpc_sse.c
> +++ b/celt/x86/celt_lpc_sse.c
> @@ -38,6 +38,8 @@
>  #include "pitch.h"
>  #include "x86cpu.h"
>
> +#if defined(FIXED_POINT)
> +
>  void celt_fir_sse4_1(const opus_val16 *_x,
>           const opus_val16 *num,
>           opus_val16 *_y,
> @@ -126,3 +128,5 @@ void celt_fir_sse4_1(const opus_val16 *_x,
>  #endif
>     RESTORE_STACK;
>  }
> +
> +#endif
> diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
> index f111420..c5ec796 100644
> --- a/celt/x86/celt_lpc_sse.h
> +++ b/celt/x86/celt_lpc_sse.h
> @@ -32,7 +32,9 @@
>  #include "config.h"
>  #endif
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
> +#define OVERRIDE_CELT_FIR
> +
>  void celt_fir_sse4_1(
>           const opus_val16 *x,
>           const opus_val16 *num,
> @@ -42,6 +44,12 @@ void celt_fir_sse4_1(
>           opus_val16 *mem,
>           int arch);
>
> +#if defined(OPUS_X86_PRESUME_SSE4_1)
> +#define celt_fir(x, num, y, N, ord, mem, arch) \
> +    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
> +
> +#else
> +
>  extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
>           const opus_val16 *x,
>           const opus_val16 *num,
> @@ -56,3 +64,5 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
>
>  #endif
>  #endif
> +
> +#endif
> diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c
> index e3bc6d7..20e7312 100644
> --- a/celt/x86/pitch_sse.c
> +++ b/celt/x86/pitch_sse.c
> @@ -29,223 +29,157 @@
>  #include "config.h"
>  #endif
>
> -#include <xmmintrin.h>
> -#include <emmintrin.h>
> -
>  #include "macros.h"
>  #include "celt_lpc.h"
>  #include "stack_alloc.h"
>  #include "mathops.h"
>  #include "pitch.h"
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
> -#include <smmintrin.h>
> -#include "x86cpu.h"
> -
> -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16
*y,
> -      int N)
> -{
> -    opus_int  i, dataSize16;
> -    opus_int32 sum;
> -    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
> -    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
> -    __m128i inVec1_3210, inVec2_3210;
> -
> -    sum = 0;
> -    dataSize16 = N & ~15;
> -
> -    acc1 = _mm_setzero_si128();
> -    acc2 = _mm_setzero_si128();
> -
> -    for (i=0;i<dataSize16;i+=16) {
> -        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> -        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> -
> -        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
> -        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
> -
> -        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> -        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98,
inVec2_FEDCBA98);
> -
> -        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> -        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
> -    }
> +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
>
> -    acc1 = _mm_add_epi32(acc1, acc2);
> -
> -    if (N - i >= 8)
> -    {
> -        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> -        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> -
> -        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> -
> -        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> -        i += 8;
> -    }
> -
> -    if (N - i >= 4)
> -    {
> -        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
> -        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
> -
> -        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
> -
> -        acc1 = _mm_add_epi32(acc1, inVec1_3210);
> -        i += 4;
> -    }
> -
> -    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
> -    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
> -
> -    sum += _mm_cvtsi128_si32(acc1);
> -
> -    for (;i<N;i++)
> -    {
> -        sum = silk_SMLABB(sum, x[i], y[i]);
> -    }
> +#include <xmmintrin.h>
> +#include "arch.h"
>
> -    return sum;
> +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32
sum[4], int len)
> +{
> +   int j;
> +   __m128 xsum1, xsum2;
> +   xsum1 = _mm_loadu_ps(sum);
> +   xsum2 = _mm_setzero_ps();
> +
> +   for (j = 0; j < len-3; j += 4)
> +   {
> +      __m128 x0 = _mm_loadu_ps(x+j);
> +      __m128 yj = _mm_loadu_ps(y+j);
> +      __m128 y3 = _mm_loadu_ps(y+j+3);
> +
> +      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
> +      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
> +                                          _mm_shuffle_ps(yj,y3,0x49)));
> +      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
> +                                          _mm_shuffle_ps(yj,y3,0x9e)));
> +      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
> +   }
> +   if (j < len)
> +   {
> +      xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> +      if (++j < len)
> +      {
> +         xsum2 =
_mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> +         if (++j < len)
> +         {
> +            xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> +         }
> +      }
> +   }
> +   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
>  }
>
> -void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y,
opus_val32 sum[ 4 ], int len)
> +
> +void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const
opus_val16 *y02,
> +      int N, opus_val32 *xy1, opus_val32 *xy2)
>  {
> -    int j;
> -
> -    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
> -    __m128i vecY0, vecY1, vecY2, vecY3;
> -    __m128i sum0, sum1, sum2, sum3, vecSum;
> -    __m128i initSum;
> -
> -    celt_assert(len >= 3);
> -
> -    sum0 = _mm_setzero_si128();
> -    sum1 = _mm_setzero_si128();
> -    sum2 = _mm_setzero_si128();
> -    sum3 = _mm_setzero_si128();
> -
> -    for (j=0;j<(len-7);j+=8)
> -    {
> -        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
> -        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
> -        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
> -        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
> -        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
> -
> -        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
> -        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
> -        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
> -        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
> -    }
> -
> -    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
> -    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
> -
> -    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
> -    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
> -
> -    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
> -    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
> -
> -    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
> -    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
> -
> -    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
> -          _mm_unpacklo_epi32(sum2, sum3));
> -
> -    for (;j<(len-3);j+=4)
> -    {
> -        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
> -        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
> -        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
> -        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
> -        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
> -
> -        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
> -        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
> -        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
> -        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
> -
> -        sum0 = _mm_mullo_epi32(vecX0, vecY0);
> -        sum1 = _mm_mullo_epi32(vecX1, vecY1);
> -        sum2 = _mm_mullo_epi32(vecX2, vecY2);
> -        sum3 = _mm_mullo_epi32(vecX3, vecY3);
> -
> -        sum0 = _mm_add_epi32(sum0, sum1);
> -        sum2 = _mm_add_epi32(sum2, sum3);
> -        vecSum = _mm_add_epi32(vecSum, sum0);
> -        vecSum = _mm_add_epi32(vecSum, sum2);
> -    }
> -
> -    for (;j<len;j++)
> -    {
> -        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
> -        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
> -
> -        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
> -
> -        sum0 = _mm_mullo_epi32(vecX0, vecY0);
> -        vecSum = _mm_add_epi32(vecSum, sum0);
> -    }
> -
> -    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
> -    initSum = _mm_add_epi32(initSum, vecSum);
> -    _mm_storeu_si128((__m128i *)sum, initSum);
> +   int i;
> +   __m128 xsum1, xsum2;
> +   xsum1 = _mm_setzero_ps();
> +   xsum2 = _mm_setzero_ps();
> +   for (i=0;i<N-3;i+=4)
> +   {
> +      __m128 xi = _mm_loadu_ps(x+i);
> +      __m128 y1i = _mm_loadu_ps(y01+i);
> +      __m128 y2i = _mm_loadu_ps(y02+i);
> +      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
> +      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
> +   }
> +   /* Horizontal sum */
> +   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
> +   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
> +   _mm_store_ss(xy1, xsum1);
> +   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
> +   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
> +   _mm_store_ss(xy2, xsum2);
> +   for (;i<N;i++)
> +   {
> +      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
> +      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
> +   }
>  }
> -#endif
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE2)
> -opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
> +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
>        int N)
>  {
> -    opus_int  i, dataSize16;
> -    opus_int32 sum;
> -
> -    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
> -    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
> -
> -    sum = 0;
> -    dataSize16 = N & ~15;
> -
> -    acc1 = _mm_setzero_si128();
> -    acc2 = _mm_setzero_si128();
> -
> -    for (i=0;i<dataSize16;i+=16)
> -    {
> -        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> -        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> -
> -        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
> -        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
> -
> -        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> -        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98,
inVec2_FEDCBA98);
> -
> -        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> -        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
> -    }
> -
> -    acc1 = _mm_add_epi32( acc1, acc2 );
> -
> -    if (N - i >= 8)
> -    {
> -        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> -        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> -
> -        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> +   int i;
> +   float xy;
> +   __m128 sum;
> +   sum = _mm_setzero_ps();
> +   /* FIXME: We should probably go 8-way and use 2 sums. */
> +   for (i=0;i<N-3;i+=4)
> +   {
> +      __m128 xi = _mm_loadu_ps(x+i);
> +      __m128 yi = _mm_loadu_ps(y+i);
> +      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
> +   }
> +   /* Horizontal sum */
> +   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
> +   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
> +   _mm_store_ss(&xy, sum);
> +   for (;i<N;i++)
> +   {
> +      xy = MAC16_16(xy, x[i], y[i]);
> +   }
> +   return xy;
> +}
>
> -        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> -        i += 8;
> -    }
> +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
> +      opus_val16 g10, opus_val16 g11, opus_val16 g12)
> +{
> +   int i;
> +   __m128 x0v;
> +   __m128 g10v, g11v, g12v;
> +   g10v = _mm_load1_ps(&g10);
> +   g11v = _mm_load1_ps(&g11);
> +   g12v = _mm_load1_ps(&g12);
> +   x0v = _mm_loadu_ps(&x[-T-2]);
> +   for (i=0;i<N-3;i+=4)
> +   {
> +      __m128 yi, yi2, x1v, x2v, x3v, x4v;
> +      const opus_val32 *xp = &x[i-T-2];
> +      yi = _mm_loadu_ps(x+i);
> +      x4v = _mm_loadu_ps(xp+4);
> +#if 0
> +      /* Slower version with all loads */
> +      x1v = _mm_loadu_ps(xp+1);
> +      x2v = _mm_loadu_ps(xp+2);
> +      x3v = _mm_loadu_ps(xp+3);
> +#else
> +      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
> +      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
> +      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
> +#endif
>
> -    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
> -    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
> -    sum += _mm_cvtsi128_si32(acc1);
> +      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
> +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
> +      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
> +      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
> +#else
> +      /* Use partial sums */
> +      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
> +                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
> +      yi = _mm_add_ps(yi, yi2);
> +#endif
> +      x0v=x4v;
> +      _mm_storeu_ps(y+i, yi);
> +   }
> +#ifdef CUSTOM_MODES
> +   for (;i<N;i++)
> +   {
> +      y[i] = x[i]
> +               + MULT16_32_Q15(g10,x[i-T])
> +               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
> +               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
> +   }
> +#endif
> +}
>
> -    for (;i<N;i++) {
> -        sum = silk_SMLABB(sum, x[i], y[i]);
> -    }
>
> -    return sum;
> -}
>  #endif
> diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
> index 99d1919..cbe722c 100644
> --- a/celt/x86/pitch_sse.h
> +++ b/celt/x86/pitch_sse.h
> @@ -37,17 +37,37 @@
>  #include "config.h"
>  #endif
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
>  void xcorr_kernel_sse4_1(
>                      const opus_int16 *x,
>                      const opus_int16 *y,
>                      opus_val32       sum[4],
>                      int              len);
> +#endif
> +
> +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
> +void xcorr_kernel_sse(
> +                    const opus_val16 *x,
> +                    const opus_val16 *y,
> +                    opus_val32       sum[4],
> +                    int              len);
> +#endif
> +
> +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
> +#define OVERRIDE_XCORR_KERNEL
> +#define xcorr_kernel(x, y, sum, len, arch) \
> +    ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
> +
> +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
> +#define OVERRIDE_XCORR_KERNEL
> +#define xcorr_kernel(x, y, sum, len, arch) \
> +    ((void)arch, xcorr_kernel_sse(x, y, sum, len))
> +
> +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT))
|| (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
>
>  extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
> -                    const opus_int16 *x,
> -                    const opus_int16 *y,
> +                    const opus_val16 *x,
> +                    const opus_val16 *y,
>                      opus_val32       sum[4],
>                      int              len);
>
> @@ -55,181 +75,115 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK +
1])(
>  #define xcorr_kernel(x, y, sum, len, arch) \
>      ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
>
> +#endif
> +
> +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
>  opus_val32 celt_inner_prod_sse4_1(
>      const opus_int16 *x,
>      const opus_int16 *y,
>      int               N);
>  #endif
>
> -#if defined(OPUS_X86_MAY_HAVE_SSE2)
> +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
>  opus_val32 celt_inner_prod_sse2(
>      const opus_int16 *x,
>      const opus_int16 *y,
>      int               N);
>  #endif
>
> +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
> +opus_val32 celt_inner_prod_sse(
> +    const opus_val16 *x,
> +    const opus_val16 *y,
> +    int               N);
> +#endif
> +
> +
> +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
> +#define OVERRIDE_CELT_INNER_PROD
> +#define celt_inner_prod(x, y, N, arch) \
> +       ((void)arch, celt_inner_prod_sse4_1(x, y, N))
> +
> +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT)
&& !defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#define OVERRIDE_CELT_INNER_PROD
> +#define celt_inner_prod(x, y, N, arch) \
> +       ((void)arch, celt_inner_prod_sse2(x, y, N))
> +
> +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
> +#define OVERRIDE_CELT_INNER_PROD
> +#define celt_inner_prod(x, y, N, arch) \
> +       ((void)arch, celt_inner_prod_sse(x, y, N))
> +
> +
> +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) ||
defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
> +       (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
> +
>  extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
> -                    const opus_int16 *x,
> -                    const opus_int16 *y,
> +                    const opus_val16 *x,
> +                    const opus_val16 *y,
>                      int               N);
>
>  #define OVERRIDE_CELT_INNER_PROD
>  #define celt_inner_prod(x, y, N, arch) \
>      ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
> -#else
>
> -#include <xmmintrin.h>
> -#include "arch.h"
> +#endif
>
> -#define OVERRIDE_XCORR_KERNEL
> -static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const
opus_val16 *y, opus_val32 sum[4], int len)
> -{
> -   int j;
> -   __m128 xsum1, xsum2;
> -   xsum1 = _mm_loadu_ps(sum);
> -   xsum2 = _mm_setzero_ps();
> -
> -   for (j = 0; j < len-3; j += 4)
> -   {
> -      __m128 x0 = _mm_loadu_ps(x+j);
> -      __m128 yj = _mm_loadu_ps(y+j);
> -      __m128 y3 = _mm_loadu_ps(y+j+3);
> -
> -      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
> -      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
> -                                          _mm_shuffle_ps(yj,y3,0x49)));
> -      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
> -                                          _mm_shuffle_ps(yj,y3,0x9e)));
> -      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
> -   }
> -   if (j < len)
> -   {
> -      xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> -      if (++j < len)
> -      {
> -         xsum2 =
_mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> -         if (++j < len)
> -         {
> -            xsum1 =
_mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
> -         }
> -      }
> -   }
> -   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
> -}
> -
> -#define xcorr_kernel(_x, _y, _z, len, arch) \
> -    ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
> +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
>
>  #define OVERRIDE_DUAL_INNER_PROD
> -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const
opus_val16 *y01, const opus_val16 *y02,
> -      int N, opus_val32 *xy1, opus_val32 *xy2)
> -{
> -   int i;
> -   __m128 xsum1, xsum2;
> -   xsum1 = _mm_setzero_ps();
> -   xsum2 = _mm_setzero_ps();
> -   for (i=0;i<N-3;i+=4)
> -   {
> -      __m128 xi = _mm_loadu_ps(x+i);
> -      __m128 y1i = _mm_loadu_ps(y01+i);
> -      __m128 y2i = _mm_loadu_ps(y02+i);
> -      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
> -      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
> -   }
> -   /* Horizontal sum */
> -   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
> -   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
> -   _mm_store_ss(xy1, xsum1);
> -   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
> -   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
> -   _mm_store_ss(xy2, xsum2);
> -   for (;i<N;i++)
> -   {
> -      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
> -      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
> -   }
> -}
> +#define OVERRIDE_COMB_FILTER_CONST
>
> -#define OVERRIDE_CELT_INNER_PROD
> -static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x,
const opus_val16 *y,
> -      int N)
> -{
> -   int i;
> -   float xy;
> -   __m128 sum;
> -   sum = _mm_setzero_ps();
> -   /* FIXME: We should probably go 8-way and use 2 sums. */
> -   for (i=0;i<N-3;i+=4)
> -   {
> -      __m128 xi = _mm_loadu_ps(x+i);
> -      __m128 yi = _mm_loadu_ps(y+i);
> -      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
> -   }
> -   /* Horizontal sum */
> -   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
> -   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
> -   _mm_store_ss(&xy, sum);
> -   for (;i<N;i++)
> -   {
> -      xy = MAC16_16(xy, x[i], y[i]);
> -   }
> -   return xy;
> -}
> -
> -#  define celt_inner_prod(_x, _y, len, arch) \
> -    ((void)(arch),celt_inner_prod_sse(_x, _y, len))
> +void dual_inner_prod_sse(const opus_val16 *x,
> +       const opus_val16 *y01,
> +       const opus_val16 *y02,
> +       int               N,
> +       opus_val32       *xy1,
> +       opus_val32       *xy2);
> +
> +void comb_filter_const_sse(opus_val32 *y,
> +       opus_val32 *x,
> +       int         T,
> +       int         N,
> +       opus_val16  g10,
> +       opus_val16  g11,
> +       opus_val16  g12);
> +
> +
> +#if defined(OPUS_X86_PRESUME_SSE)
> +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
> +    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
>
>  #define OVERRIDE_COMB_FILTER_CONST
> -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x,
int T, int N,
> -      opus_val16 g10, opus_val16 g11, opus_val16 g12)
> -{
> -   int i;
> -   __m128 x0v;
> -   __m128 g10v, g11v, g12v;
> -   g10v = _mm_load1_ps(&g10);
> -   g11v = _mm_load1_ps(&g11);
> -   g12v = _mm_load1_ps(&g12);
> -   x0v = _mm_loadu_ps(&x[-T-2]);
> -   for (i=0;i<N-3;i+=4)
> -   {
> -      __m128 yi, yi2, x1v, x2v, x3v, x4v;
> -      const opus_val32 *xp = &x[i-T-2];
> -      yi = _mm_loadu_ps(x+i);
> -      x4v = _mm_loadu_ps(xp+4);
> -#if 0
> -      /* Slower version with all loads */
> -      x1v = _mm_loadu_ps(xp+1);
> -      x2v = _mm_loadu_ps(xp+2);
> -      x3v = _mm_loadu_ps(xp+3);
> -#else
> -      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
> -      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
> -      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
> -#endif
>
> -      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
> -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
> -      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
> -      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
>  #else
> -      /* Use partial sums */
> -      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
> -                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
> -      yi = _mm_add_ps(yi, yi2);
> +
> +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
> +              const opus_val16 *x,
> +              const opus_val16 *y01,
> +              const opus_val16 *y02,
> +              int               N,
> +              opus_val32       *xy1,
> +              opus_val32       *xy2);
> +
> +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)                   
\
> +    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N,
xy1, xy2))
> +
> +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
> +              opus_val32 *y,
> +              opus_val32 *x,
> +              int         T,
> +              int         N,
> +              opus_val16  g10,
> +              opus_val16  g11,
> +              opus_val16  g12);
> +
> +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch)                
\
> +    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N,
g10, g11, g12))
> +
> +#define NON_STATIC_COMB_FILTER_CONST_C
> +
>  #endif
> -      x0v=x4v;
> -      _mm_storeu_ps(y+i, yi);
> -   }
> -#ifdef CUSTOM_MODES
> -   for (;i<N;i++)
> -   {
> -      y[i] = x[i]
> -               + MULT16_32_Q15(g10,x[i-T])
> -               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
> -               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
> -   }
>  #endif
> -}
>
>  #endif
> -#endif
> diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c
> new file mode 100644
> index 0000000..a0e7d1b
> --- /dev/null
> +++ b/celt/x86/pitch_sse2.c
> @@ -0,0 +1,95 @@
> +/* Copyright (c) 2014, Cisco Systems, INC
> +   Written by XiangMingZhu WeiZhou MinPeng YanWang
> +
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +
> +#include <xmmintrin.h>
> +#include <emmintrin.h>
> +
> +#include "macros.h"
> +#include "celt_lpc.h"
> +#include "stack_alloc.h"
> +#include "mathops.h"
> +#include "pitch.h"
> +
> +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
> +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
> +      int N)
> +{
> +    opus_int  i, dataSize16;
> +    opus_int32 sum;
> +
> +    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
> +    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
> +
> +    sum = 0;
> +    dataSize16 = N & ~15;
> +
> +    acc1 = _mm_setzero_si128();
> +    acc2 = _mm_setzero_si128();
> +
> +    for (i=0;i<dataSize16;i+=16)
> +    {
> +        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> +        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> +
> +        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
> +        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
> +
> +        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> +        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98,
inVec2_FEDCBA98);
> +
> +        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> +        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
> +    }
> +
> +    acc1 = _mm_add_epi32( acc1, acc2 );
> +
> +    if (N - i >= 8)
> +    {
> +        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> +        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> +
> +        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> +
> +        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> +        i += 8;
> +    }
> +
> +    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
> +    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
> +    sum += _mm_cvtsi128_si32(acc1);
> +
> +    for (;i<N;i++) {
> +        sum = silk_SMLABB(sum, x[i], y[i]);
> +    }
> +
> +    return sum;
> +}
> +#endif
> diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
> new file mode 100644
> index 0000000..a092c68
> --- /dev/null
> +++ b/celt/x86/pitch_sse4_1.c
> @@ -0,0 +1,195 @@
> +/* Copyright (c) 2014, Cisco Systems, INC
> +   Written by XiangMingZhu WeiZhou MinPeng YanWang
> +
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +
> +#include <xmmintrin.h>
> +#include <emmintrin.h>
> +
> +#include "macros.h"
> +#include "celt_lpc.h"
> +#include "stack_alloc.h"
> +#include "mathops.h"
> +#include "pitch.h"
> +
> +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
> +#include <smmintrin.h>
> +#include "x86cpu.h"
> +
> +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16
*y,
> +      int N)
> +{
> +    opus_int  i, dataSize16;
> +    opus_int32 sum;
> +    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
> +    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
> +    __m128i inVec1_3210, inVec2_3210;
> +
> +    sum = 0;
> +    dataSize16 = N & ~15;
> +
> +    acc1 = _mm_setzero_si128();
> +    acc2 = _mm_setzero_si128();
> +
> +    for (i=0;i<dataSize16;i+=16) {
> +        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> +        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> +
> +        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
> +        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
> +
> +        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> +        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98,
inVec2_FEDCBA98);
> +
> +        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> +        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
> +    }
> +
> +    acc1 = _mm_add_epi32(acc1, acc2);
> +
> +    if (N - i >= 8)
> +    {
> +        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
> +        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
> +
> +        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210,
inVec2_76543210);
> +
> +        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
> +        i += 8;
> +    }
> +
> +    if (N - i >= 4)
> +    {
> +        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
> +        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
> +
> +        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
> +
> +        acc1 = _mm_add_epi32(acc1, inVec1_3210);
> +        i += 4;
> +    }
> +
> +    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
> +    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
> +
> +    sum += _mm_cvtsi128_si32(acc1);
> +
> +    for (;i<N;i++)
> +    {
> +        sum = silk_SMLABB(sum, x[i], y[i]);
> +    }
> +
> +    return sum;
> +}
> +
> +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y,
opus_val32 sum[ 4 ], int len)
> +{
> +    int j;
> +
> +    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
> +    __m128i vecY0, vecY1, vecY2, vecY3;
> +    __m128i sum0, sum1, sum2, sum3, vecSum;
> +    __m128i initSum;
> +
> +    celt_assert(len >= 3);
> +
> +    sum0 = _mm_setzero_si128();
> +    sum1 = _mm_setzero_si128();
> +    sum2 = _mm_setzero_si128();
> +    sum3 = _mm_setzero_si128();
> +
> +    for (j=0;j<(len-7);j+=8)
> +    {
> +        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
> +        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
> +        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
> +        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
> +        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
> +
> +        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
> +        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
> +        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
> +        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
> +    }
> +
> +    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
> +    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
> +
> +    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
> +    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
> +
> +    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
> +    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
> +
> +    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
> +    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
> +
> +    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
> +          _mm_unpacklo_epi32(sum2, sum3));
> +
> +    for (;j<(len-3);j+=4)
> +    {
> +        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
> +        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
> +        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
> +        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
> +        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
> +
> +        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
> +        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
> +        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
> +        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
> +
> +        sum0 = _mm_mullo_epi32(vecX0, vecY0);
> +        sum1 = _mm_mullo_epi32(vecX1, vecY1);
> +        sum2 = _mm_mullo_epi32(vecX2, vecY2);
> +        sum3 = _mm_mullo_epi32(vecX3, vecY3);
> +
> +        sum0 = _mm_add_epi32(sum0, sum1);
> +        sum2 = _mm_add_epi32(sum2, sum3);
> +        vecSum = _mm_add_epi32(vecSum, sum0);
> +        vecSum = _mm_add_epi32(vecSum, sum2);
> +    }
> +
> +    for (;j<len;j++)
> +    {
> +        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
> +        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
> +
> +        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
> +
> +        sum0 = _mm_mullo_epi32(vecX0, vecY0);
> +        vecSum = _mm_add_epi32(vecSum, sum0);
> +    }
> +
> +    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
> +    initSum = _mm_add_epi32(initSum, vecSum);
> +    _mm_storeu_si128((__m128i *)sum, initSum);
> +}
> +#endif
> diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
> index 83410db..1ed2acb 100644
> --- a/celt/x86/x86_celt_map.c
> +++ b/celt/x86/x86_celt_map.c
> @@ -38,6 +38,8 @@
>
>  # if defined(FIXED_POINT)
>
> +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1)
> +
>  void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
>           const opus_val16 *x,
>           const opus_val16 *num,
> @@ -49,8 +51,8 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
>  ) = {
>    celt_fir_c,                /* non-sse */
>    celt_fir_c,
> +  celt_fir_c,
>    MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
> -  NULL
>  };
>
>  void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
> @@ -61,24 +63,86 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
>  ) = {
>    xcorr_kernel_c,                /* non-sse */
>    xcorr_kernel_c,
> +  xcorr_kernel_c,
>    MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
> -  NULL
>  };
>
> +#endif
> +
> +#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1)) ||  \
> +       (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) &&
defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
> +
>  opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
>           const opus_val16 *x,
>           const opus_val16 *y,
>           int              N
>  ) = {
>    celt_inner_prod_c,                /* non-sse */
> +  celt_inner_prod_c,
>    MAY_HAVE_SSE2(celt_inner_prod),
>    MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
> -  NULL
>  };
>
> +#endif
> +
>  # else
> -#  error "Floating-point implementation is not supported by x86 RTCD
yet." \
> - "Reconfigure with --disable-rtcd or send patches."
> -# endif
>
> +#if defined(OPUS_X86_MAY_HAVE_SSE) &&
!defined(OPUS_X86_PRESUME_SSE)
> +
> +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
> +         const opus_val16 *x,
> +         const opus_val16 *y,
> +         opus_val32       sum[4],
> +         int              len
> +) = {
> +  xcorr_kernel_c,                /* non-sse */
> +  MAY_HAVE_SSE(xcorr_kernel),
> +  MAY_HAVE_SSE(xcorr_kernel),
> +  MAY_HAVE_SSE(xcorr_kernel),
> +};
> +
> +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
> +         const opus_val16 *x,
> +         const opus_val16 *y,
> +         int              N
> +) = {
> +  celt_inner_prod_c,                /* non-sse */
> +  MAY_HAVE_SSE(celt_inner_prod),
> +  MAY_HAVE_SSE(celt_inner_prod),
> +  MAY_HAVE_SSE(celt_inner_prod),
> +};
> +
> +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
> +                    const opus_val16 *x,
> +                    const opus_val16 *y01,
> +                    const opus_val16 *y02,
> +                    int               N,
> +                    opus_val32       *xy1,
> +                    opus_val32       *xy2
> +) = {
> +  dual_inner_prod_c,                /* non-sse */
> +  MAY_HAVE_SSE(dual_inner_prod),
> +  MAY_HAVE_SSE(dual_inner_prod),
> +  MAY_HAVE_SSE(dual_inner_prod),
> +};
> +
> +void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
> +              opus_val32 *y,
> +              opus_val32 *x,
> +              int         T,
> +              int         N,
> +              opus_val16  g10,
> +              opus_val16  g11,
> +              opus_val16  g12
> +) = {
> +  comb_filter_const_c,                /* non-sse */
> +  MAY_HAVE_SSE(comb_filter_const),
> +  MAY_HAVE_SSE(comb_filter_const),
> +  MAY_HAVE_SSE(comb_filter_const),
> +};
> +
> +
> +#endif
> +
> +#endif
>  #endif
> diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
> index c82a4b7..afcdeb6 100644
> --- a/celt/x86/x86cpu.c
> +++ b/celt/x86/x86cpu.c
> @@ -35,10 +35,19 @@
>  #include "pitch.h"
>  #include "x86cpu.h"
>
> +#if (defined(OPUS_X86_MAY_HAVE_SSE) &&
!defined(OPUS_X86_PRESUME_SSE)) || \
> +  (defined(OPUS_X86_MAY_HAVE_SSE2) &&
!defined(OPUS_X86_PRESUME_SSE2)) || \
> +  (defined(OPUS_X86_MAY_HAVE_SSE4_1) &&
!defined(OPUS_X86_PRESUME_SSE4_1))
> +
> +
>  #if defined(_MSC_VER)
>
>  #include <intrin.h>
> -#define cpuid(info,x) __cpuid(info,x)
> +static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
> +{
> +       __cpuid((int*)CPUInfo, InfoType);
> +}
> +
>  #else
>
>  #if defined(CPU_INFO_BY_C)
> @@ -48,14 +57,28 @@
>  static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
>  {
>  #if defined(CPU_INFO_BY_ASM)
> +#if defined(__i386__) && defined(__PIC__)
> +/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
> +    __asm__ __volatile__ (
> +       "xchg %%ebx, %1\n"
> +       "cpuid\n"
> +       "xchg %%ebx, %1\n":
> +        "=a" (CPUInfo[0]),
> +        "=r" (CPUInfo[1]),
> +        "=c" (CPUInfo[2]),
> +        "=d" (CPUInfo[3]) :
> +        "0" (InfoType)
> +    );
> +#else
>      __asm__ __volatile__ (
>          "cpuid":
>          "=a" (CPUInfo[0]),
>          "=b" (CPUInfo[1]),
>          "=c" (CPUInfo[2]),
>          "=d" (CPUInfo[3]) :
> -        "a" (InfoType), "c" (0)
> +        "0" (InfoType)
>      );
> +#endif
>  #elif defined(CPU_INFO_BY_C)
>      __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]),
&(CPUInfo[2]), &(CPUInfo[3]));
>  #endif
> @@ -63,11 +86,9 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int
InfoType)
>
>  #endif
>
> -#include "SigProc_FIX.h"
> -#include "celt_lpc.h"
> -
>  typedef struct CPU_Feature{
>      /*  SIMD: 128-bit */
> +    int HW_SSE;
>      int HW_SSE2;
>      int HW_SSE41;
>  } CPU_Feature;
> @@ -82,19 +103,31 @@ static void opus_cpu_feature_check(CPU_Feature
*cpu_feature)
>
>      if (nIds >= 1){
>          cpuid(info, 1);
> +        cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
>          cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
>          cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
>      }
> +    else {
> +        cpu_feature->HW_SSE = 0;
> +        cpu_feature->HW_SSE2 = 0;
> +        cpu_feature->HW_SSE41 = 0;
> +    }
>  }
>
>  int opus_select_arch(void)
>  {
> -    CPU_Feature cpu_feature = {0};
> +    CPU_Feature cpu_feature;
>      int arch;
>
>      opus_cpu_feature_check(&cpu_feature);
>
>      arch = 0;
> +    if (!cpu_feature.HW_SSE)
> +    {
> +       return arch;
> +    }
> +    arch++;
> +
>      if (!cpu_feature.HW_SSE2)
>      {
>         return arch;
> @@ -109,3 +142,5 @@ int opus_select_arch(void)
>
>      return arch;
>  }
> +
> +#endif
> diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
> index ef53f0c..7f4c61d 100644
> --- a/celt/x86/x86cpu.h
> +++ b/celt/x86/x86cpu.h
> @@ -28,6 +28,12 @@
>  #if !defined(X86CPU_H)
>  # define X86CPU_H
>
> +# if defined(OPUS_X86_MAY_HAVE_SSE)
> +#  define MAY_HAVE_SSE(name) name ## _sse
> +# else
> +#  define MAY_HAVE_SSE(name) name ## _c
> +# endif
> +
>  # if defined(OPUS_X86_MAY_HAVE_SSE2)
>  #  define MAY_HAVE_SSE2(name) name ## _sse2
>  # else
> @@ -55,21 +61,25 @@ int opus_select_arch(void);
>    reference in the PMOVSXWD instruction itself, but gcc is not smart
enough to
>    optimize this out when optimizations ARE enabled.
>
> -  It appears clang requires us to do this always (which is fair, since
> -  technically the compiler is always allowed to do the dereference before
> -  invoking the function implementing the intrinsic). I have not
investiaged
> -  whether it is any smarter than gcc when it comes to eliminating the
extra
> -  load instruction.*/
> +  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
> +  (which is fair, since technically the compiler is always allowed to do
the
> +  dereference before invoking the function implementing the intrinsic).
> +  However, it is smart enough to eliminate the extra MOVD instruction.
> +  For _mm_cvtepi16_epi32, it does the right thing, though does *not*
optimize out
> +  the extra MOVQ if it's specified explicitly */
> +
>  # if defined(__clang__) || !defined(__OPTIMIZE__)
>  #  define OP_CVTEPI8_EPI32_M32(x) \
>   (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
> -
> -#  define OP_CVTEPI16_EPI32_M64(x) \
> - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
>  # else
>  #  define OP_CVTEPI8_EPI32_M32(x) \
>   (_mm_cvtepi8_epi32(*(__m128i *)(x)))
> +#endif
>
> +# if !defined(__OPTIMIZE__)
> +#  define OP_CVTEPI16_EPI32_M64(x) \
> + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
> +# else
>  #  define OP_CVTEPI16_EPI32_M64(x) \
>   (_mm_cvtepi16_epi32(*(__m128i *)(x)))
>  # endif
> diff --git a/celt_sources.mk b/celt_sources.mk
> index 7121301..2ffe99a 100644
> --- a/celt_sources.mk
> +++ b/celt_sources.mk
> @@ -21,7 +21,10 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \
>  celt/x86/x86_celt_map.c \
>  celt/x86/pitch_sse.c
>
> -CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c
> +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c
> +
> +CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \
> +celt/x86/pitch_sse4_1.c
>
>  CELT_SOURCES_ARM = \
>  celt/arm/armcpu.c \
> diff --git a/configure.ac b/configure.ac
> index baa3425..9b05fc1 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -348,8 +348,24 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
>  AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
>      [test x"${asm_optimization%% *}" = x"ARM"])
>
> -AM_CONDITIONAL([HAVE_SSE4_1], [false])
> +AM_CONDITIONAL([HAVE_SSE], [false])
>  AM_CONDITIONAL([HAVE_SSE2], [false])
> +AM_CONDITIONAL([HAVE_SSE4_1], [false])
> +
> +m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse])
> +m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2])
> +m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1])
> +m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon])
> +
> +AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics
@<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@])
> +AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics
@<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@])
> +AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1
intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@])
> +AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON
intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS[@:>@])
> +
> +AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS],
DEFAULT_X86_SSE_CFLAGS)])
> +AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS],
DEFAULT_X86_SSE2_CFLAGS)])
> +AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS],
DEFAULT_X86_SSE4_1_CFLAGS)])
> +AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [],
[AS_VAR_SET([ARM_NEON_INTR_CFLAGS], DEFAULT_ARM_NEON_INTR_CFLAGS)])
>
>  AC_DEFUN([OPUS_PATH_NE10],
>     [
> @@ -426,45 +442,53 @@ AC_DEFUN([OPUS_PATH_NE10],
>  )
>
>  AS_IF([test x"$enable_intrinsics" = x"yes"],[
> -   case $host_cpu in
> -   arm*)
> +   intrinsics_support=""
> +   AS_CASE([$host_cpu],
> +   [arm*],
> +   [
>        cpu_arm=yes
> -      AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics)
> -      save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon
$CFLAGS"
> -      AC_LINK_IFELSE(
> -         [
> -            AC_LANG_PROGRAM(
> -               [[#include <arm_neon.h>
> -               ]],
> -               [[
> -                  static float32x4_t A[2], SUMM;
> -                  SUMM = vmlaq_f32(SUMM, A[0], A[1]);
> -               ]]
> -            )
> -         ],[
> -            OPUS_ARM_NEON_INTR=1
> -            AC_MSG_RESULT([yes])
> -         ],[
> -            OPUS_ARM_NEON_INTR=0
> -            AC_MSG_RESULT([no])
> -         ]
> +      OPUS_CHECK_INTRINSICS(
> +         [ARM Neon],
> +         [$ARM_NEON_INTR_CFLAGS],
> +         [OPUS_ARM_MAY_HAVE_NEON_INTR],
> +         [OPUS_ARM_PRESUME_NEON_INTR],
> +         [[#include <arm_neon.h>
> +         ]],
> +         [[
> +            static float32x4_t A0, A1, SUMM;
> +            SUMM = vmlaq_f32(SUMM, A0, A1);
> +         ]]
> +      )
> +      AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" =
x"1" && test x"$OPUS_ARM_PRESUME_NEON_INTR" !=
x"1"],
> +          [
> +             OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS"
> +             AC_SUBST([OPUS_ARM_NEON_INTR_CFLAGS])
> +          ]
>        )
> -      CFLAGS="$save_CFLAGS"
> -      #Now we know if compiler supports ARM neon intrinsics or not
>
> -      #Currently we only have intrinsic optimization for floating point
> +      #Currently we only have intrinsic optimizations for floating point
>        AS_IF([test x"$enable_float" = x"yes"],
>        [
> -         AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"],
> +         AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" =
x"1"],
>           [
> -            AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7
Neon Intrinsics])
> -            AS_IF([test x"enable_rtcd" != x""],
> -               [rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[])
> -            enable_intrinsics="$enable_intrinsics
ARMv7_Neon_Intrinsics"
> -            dnl Don't see why defining these is necessary to check
features at runtime
> -            AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler
support EDSP Instructions])
> -            AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler
support MEDIA Instructions])
> -            AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler
support NEON instructions])
> +            AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1,
> +                      [Compiler supports ARMv7 Neon Intrinsics])
> +            intrinsics_support="$intrinsics_support
(Neon_Intrinsics)"
> +
> +            AS_IF([test x"enable_rtcd" != x""
&& test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
> +                  [rtcd_support="$rtcd_support
(ARMv7_Neon_Intrinsics)"],[])
> +
> +            AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" =
x"1"],
> +                  [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1,
> +                             [Define if binary requires NEON intrinsics
support])])
> +
> +                          AS_IF([test x"$rtcd_support" =
x""],
> +                  [rtcd_support=no])
> +
> +            AS_IF([test x"$intrinsics_support" = x""],
> +                  [intrinsics_support=no],
> +                               
[intrinsics_support="arm$intrinsics_support"])
> +
>
>              OPUS_PATH_NE10()
>              AS_IF([test x"$NE10_LIBS" != "x"],
> @@ -472,18 +496,122 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
>           ],
>           [
>              AC_MSG_WARN([Compiler does not support ARM intrinsics])
> -            enable_intrinsics=no
> +            intrinsics_support=no
>           ])
>        ], [
> -            AC_MSG_WARN([Currently on have ARM intrinsics for float])
> -            enable_intrinsics=no
> +            AC_MSG_WARN([Currently only have ARM intrinsics for float])
> +            intrinsics_support=no
>        ])
> -   ;;
> -   "i386" | "i686" | "x86_64")
> -    AS_IF([test x"$enable_float" = x"no"],[
> -    AS_IF([test x"$enable_rtcd" = x"yes"],[
> +   ],
> +   [i?86|x86_64],
> +   [
> +      OPUS_CHECK_INTRINSICS(
> +         [SSE],
> +         [$X86_SSE_CFLAGS],
> +         [OPUS_X86_MAY_HAVE_SSE],
> +         [OPUS_X86_PRESUME_SSE],
> +         [[#include <xmmintrin.h>
> +         ]],
> +         [[
> +             static __m128 mtest;
> +             mtest = _mm_setzero_ps();
> +         ]]
> +      )
> +      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE" != x"1"],
> +          [
> +             OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS"
> +             AC_SUBST([OPUS_X86_SSE_CFLAGS])
> +          ]
> +      )
> +      OPUS_CHECK_INTRINSICS(
> +         [SSE2],
> +         [$X86_SSE2_CFLAGS],
> +         [OPUS_X86_MAY_HAVE_SSE2],
> +         [OPUS_X86_PRESUME_SSE2],
> +         [[#include <emmintrin.h>
> +         ]],
> +         [[
> +             static __m128i mtest;
> +             mtest = _mm_setzero_si128();
> +         ]]
> +      )
> +      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE2" != x"1"],
> +          [
> +             OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS"
> +             AC_SUBST([OPUS_X86_SSE2_CFLAGS])
> +          ]
> +      )
> +      OPUS_CHECK_INTRINSICS(
> +         [SSE4.1],
> +         [$X86_SSE4_1_CFLAGS],
> +         [OPUS_X86_MAY_HAVE_SSE4_1],
> +         [OPUS_X86_PRESUME_SSE4_1],
> +         [[#include <smmintrin.h>
> +         ]],
> +         [[
> +            static __m128i mtest;
> +            mtest = _mm_setzero_si128();
> +            mtest = _mm_cmpeq_epi64(mtest, mtest);
> +         ]]
> +      )
> +      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"
&& test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"],
> +          [
> +             OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS"
> +             AC_SUBST([OPUS_X86_SSE4_1_CFLAGS])
> +          ]
> +      )
> +
> +         AS_IF([test x"$rtcd_support" = x"no"],
[rtcd_support=""])
> +         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" =
x"1"],
> +         [
> +            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86
SSE Intrinsics])
> +            intrinsics_support="$intrinsics_support SSE"
> +
> +            AS_IF([test x"$OPUS_X86_PRESUME_SSE" =
x"1"],
> +               [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary
requires SSE intrinsics support])],
> +               [rtcd_support="$rtcd_support SSE"])
> +         ],
> +         [
> +            AC_MSG_WARN([Compiler does not support SSE intrinsics])
> +         ])
> +
> +         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" =
x"1"],
> +         [
> +            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86
SSE2 Intrinsics])
> +            intrinsics_support="$intrinsics_support SSE2"
> +
> +            AS_IF([test x"$OPUS_X86_PRESUME_SSE2" =
x"1"],
> +               [AC_DEFINE([OPUS_X86_PRESUME_SSE2], 1, [Define if binary
requires SSE2 intrinsics support])],
> +               [rtcd_support="$rtcd_support SSE2"])
> +         ],
> +         [
> +            AC_MSG_WARN([Compiler does not support SSE2 intrinsics])
> +         ])
> +
> +         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" =
x"1"],
> +         [
> +            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], 1, [Compiler supports
X86 SSE4.1 Intrinsics])
> +            intrinsics_support="$intrinsics_support SSE4.1"
> +
> +            AS_IF([test x"$OPUS_X86_PRESUME_SSE4_1" =
x"1"],
> +               [AC_DEFINE([OPUS_X86_PRESUME_SSE4_1], 1, [Define if binary
requires SSE4.1 intrinsics support])],
> +               [rtcd_support="$rtcd_support SSE4.1"])
> +         ],
> +         [
> +            AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics])
> +         ])
> +         AS_IF([test x"$intrinsics_support" = x""],
> +            [intrinsics_support=no],
> +            [intrinsics_support="x86$intrinsics_support"]
> +         )
> +         AS_IF([test x"$rtcd_support" = x""],
> +            [rtcd_support=no],
> +            [rtcd_support="x86$rtcd_support"],
> +        )
> +
> +    AS_IF([test x"$enable_rtcd" = x"yes" &&
test x"$rtcd_support" != x""],[
>              get_cpuid_by_asm="no"
> -            AC_MSG_CHECKING([Get CPU Info])
> +            AC_MSG_CHECKING([How to get X86 CPU Info])
>              AC_LINK_IFELSE([AC_LANG_PROGRAM([[
>                   #include <stdio.h>
>              ]],[[
> @@ -493,7 +621,7 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
>                   unsigned int CPUInfo3;
>                   unsigned int InfoType;
>                   __asm__ __volatile__ (
> -                 "cpuid11":
> +                 "cpuid":
>                   "=a" (CPUInfo0),
>                   "=b" (CPUInfo1),
>                   "=c" (CPUInfo2),
> @@ -502,7 +630,8 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
>                  );
>              ]])],
>              [get_cpuid_by_asm="yes"
> -             AC_MSG_RESULT([Inline Assembly])],
> +             AC_MSG_RESULT([Inline Assembly])
> +                        AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by
asm method])],
>               [AC_LINK_IFELSE([AC_LANG_PROGRAM([[
>                   #include <cpuid.h>
>              ]],[[
> @@ -513,90 +642,31 @@ AS_IF([test x"$enable_intrinsics" =
x"yes"],[
>                   unsigned int InfoType;
>                   __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1,
&CPUInfo2, &CPUInfo3);
>              ]])],
> -            [AC_MSG_RESULT([C method])],
> -            [AC_MSG_ERROR([not support Get CPU Info, please disable
intrinsics ])])])
> -
> -       AC_MSG_CHECKING([sse4.1])
> -       TMP_CFLAGS="$CFLAGS"
> -       gcc -Q --help=target | grep "\-msse4.1 "
> -       AS_IF([test x"$?" = x"0"],[
> -            CFLAGS="$CFLAGS -msse4.1"
> -            AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't
find xmmintrin.h])])
> -            AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't
find emmintrin.h])])
> -            AC_CHECK_HEADER(smmintrin.h, [], [AC_MSG_ERROR([Couldn't
find smmintrin.h])],[
> -            #ifdef HAVE_XMMINSTRIN_H
> -                 #include <xmmintrin.h>
> -                 #endif
> -                 #ifdef HAVE_EMMINSTRIN_H
> -                 #include <emmintrin.h>
> -                 #endif
> -            ])
> -
> -            AC_LINK_IFELSE([AC_LANG_PROGRAM([[
> -                 #include <xmmintrin.h>
> -                 #include <emmintrin.h>
> -                 #include <smmintrin.h>
> -            ]],[[
> -                 __m128i mtest = _mm_setzero_si128();
> -                 mtest = _mm_cmpeq_epi64(mtest, mtest);
> -            ]])],
> -            [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker
failure for sse4.1, please disable intrinsics])])
> -
> -            CFLAGS="$TMP_CFLAGS"
> -            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], [1], [For x86 sse4.1
instrinsics optimizations])
> -            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2
instrinsics optimizations])
> -            rtcd_support="x86 sse4.1"
> -            AM_CONDITIONAL([HAVE_SSE4_1], [true])
> -            AM_CONDITIONAL([HAVE_SSE2], [true])
> -            AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
> -            [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])])
> -             ],[ ##### Else case for AS_IF([test x"$?" =
x"0"])
> -               gcc -Q --help=target | grep "\-msse2 "
> -               AC_MSG_CHECKING([sse2])
> -               AS_IF([test x"$?" = x"0"],[
> -                   AC_MSG_RESULT([yes])
> -                   CFLAGS="$CFLAGS -msse2"
> -                   AC_CHECK_HEADER(xmmintrin.h, [],
[AC_MSG_ERROR([Couldn't find xmmintrin.h])])
> -                   AC_CHECK_HEADER(emmintrin.h, [],
[AC_MSG_ERROR([Couldn't find emmintrin.h])])
> -
> -                   AC_LINK_IFELSE([AC_LANG_PROGRAM([[
> -                        #include <xmmintrin.h>
> -                        #include <emmintrin.h>
> -                   ]],[[
> -                        __m128i mtest = _mm_setzero_si128();
> -                   ]])],
> -                   [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler &
linker failure for sse2, please disable intrinsics])])
> -
> -                  CFLAGS="$TMP_CFLAGS"
> -                  AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2
instrinsics optimize])
> -                  rtcd_support="x86 sse2"
> -                  AM_CONDITIONAL([HAVE_SSE2], [true])
> -                  AS_IF([test x"$get_cpuid_by_asm" =
x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm
method])],
> -                  [AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c
method])])
> -            ],[enable_intrinsics="no"]) #End of AS_IF([test
x"$?" = x"0"]
> -        ])
> -    ], [
> -        enable_intrinsics="no"
> -    ]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"]
> -],
> -[  ## Else case for AS_IF([test x"$enable_float" =
x"no"]
> -   AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for
fixed point])
> -   enable_intrinsics="no"
> -]) ## End of AS_IF([test x"$enable_float" = x"no"]
> -   ;;
> -   *)
> +            [AC_MSG_RESULT([C method])
> +                        AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c
method])],
> +            [AC_MSG_ERROR([no supported Get CPU Info method, please
disable intrinsics])])])])
> +   ],
> +   [
>        AC_MSG_WARN([No intrinsics support for your architecture])
> -      enable_intrinsics="no"
> -   ;;
> -   esac
> +      intrinsics_support="no"
> +   ])
> +],
> +[
> +   intrinsics_support="no"
>  ])
>
>  AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
>  AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
> -    [test x"$OPUS_ARM_NEON_INTR" = x"1"])
> +    [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
>  AM_CONDITIONAL([HAVE_ARM_NE10],
>      [test x"$HAVE_ARM_NE10" = x"1"])
>
> +AM_CONDITIONAL([HAVE_SSE],
> +    [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
> +AM_CONDITIONAL([HAVE_SSE2],
> +    [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"])
> +AM_CONDITIONAL([HAVE_SSE4_1],
> +    [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"])
>
>  AS_IF([test x"$enable_rtcd" = x"yes"],[
>      AS_IF([test x"$rtcd_support" != x"no"],[
> @@ -704,7 +774,7 @@ AC_MSG_NOTICE([
>        Fixed point debugging: ......... ${enable_fixed_point_debug}
>        Inline Assembly Optimizations: . ${inline_optimization}
>        External Assembly Optimizations: ${asm_optimization}
> -      Intrinsics Optimizations.......: ${enable_intrinsics}
> +      Intrinsics Optimizations.......: ${intrinsics_support}
>        Run-time CPU detection: ........ ${rtcd_support}
>        Custom modes: .................. ${enable_custom_modes}
>        Assertion checking: ............ ${enable_assertions}
> diff --git a/m4/opus-intrinsics.m4 b/m4/opus-intrinsics.m4
> new file mode 100644
> index 0000000..c74aecd
> --- /dev/null
> +++ b/m4/opus-intrinsics.m4
> @@ -0,0 +1,29 @@
> +dnl opus-intrinsics.m4
> +dnl macro for testing for support for compiler intrinsics, either by
default or with a compiler flag
> +
> +dnl OPUS_CHECK_INTRINSICS(NAME-OF-INTRINSICS,
COMPILER-FLAG-FOR-INTRINSICS, VAR-IF-PRESENT, VAR-IF-DEFAULT,
TEST-PROGRAM-HEADER, TEST-PROGRAM-BODY)
> +AC_DEFUN([OPUS_CHECK_INTRINSICS],
> +[
> +   AC_MSG_CHECKING([if compiler supports $1 intrinsics])
> +   AC_LINK_IFELSE(
> +     [AC_LANG_PROGRAM($5, $6)],
> +     [
> +        $3=1
> +       $4=1
> +        AC_MSG_RESULT([yes])
> +      ],[
> +        $4=0
> +        AC_MSG_RESULT([no])
> +        AC_MSG_CHECKING([if compiler supports $1 intrinsics with $2])
> +        save_CFLAGS="$CFLAGS"; CFLAGS="$2 $CFLAGS"
> +        AC_LINK_IFELSE([AC_LANG_PROGRAM($5, $6)],
> +        [
> +           AC_MSG_RESULT([yes])
> +           $3=1
> +        ],[
> +           AC_MSG_RESULT([no])
> +           $3=0
> +        ])
> +        CFLAGS="$save_CFLAGS"
> +     ])
> +])
> diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
> index 9a0e096..61efa8d 100644
> --- a/silk/x86/SigProc_FIX_sse.h
> +++ b/silk/x86/SigProc_FIX_sse.h
> @@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1(
>      int                         arch                /* I    Run-time
architecture                                       */
>  );
>
> +#if defined(OPUS_X86_PRESUME_SSE4_1)
> +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30,
subfr_length, nb_subfr, D, arch) \
> +    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x,
minInvGain_Q30, subfr_length, nb_subfr, D, arch))
> +
> +#else
> +
>  extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
>      opus_int32                  *res_nrg,           /* O    Residual
energy                                             */
>      opus_int                    *res_nrg_Q,         /* O    Residual
energy Q value                                     */
> @@ -59,12 +65,22 @@ extern void (*const
SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
>  #  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30,
subfr_length, nb_subfr, D, arch) \
>      ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg,
res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
>
> +#endif
> +
>  opus_int64 silk_inner_prod16_aligned_64_sse4_1(
>      const opus_int16 *inVec1,
>      const opus_int16 *inVec2,
>      const opus_int   len
>  );
>
> +
> +#if defined(OPUS_X86_PRESUME_SSE4_1)
> +
> +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
> +    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2,
len))
> +
> +#else
> +
>  extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK
+ 1])(
>                      const opus_int16 *inVec1,
>                      const opus_int16 *inVec2,
> @@ -75,3 +91,4 @@ extern opus_int64 (*const
SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
>
>  #endif
>  #endif
> +#endif
> diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
> index f970632..afd5ec2 100644
> --- a/silk/x86/main_sse.h
> +++ b/silk/x86/main_sse.h
> @@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1(
>      opus_int                    L                               /* I   
number of vectors in codebook               */
>  );
>
> +#if defined OPUS_X86_PRESUME_SSE4_1
> +
> +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7,
cb_gain_Q7, cl_Q5, \
> +                          mu_Q9, max_gain_Q7, L, arch) \
> +    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7,
in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
> +                          mu_Q9, max_gain_Q7, L))
> +
> +#else
> +
>  extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
>      opus_int8                   *ind,                           /* O   
index of best codebook vector               */
>      opus_int32                  *rate_dist_Q14,                 /* O   
best weighted quant error + mu * rate       */
> @@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK +
1])(
>      ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind,
rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
>                            mu_Q9, max_gain_Q7, L))
>
> +#endif
> +
>  #  define OVERRIDE_silk_NSQ
>
>  void silk_NSQ_sse4_1(
> @@ -89,6 +100,15 @@ void silk_NSQ_sse4_1(
>      const opus_int              LTP_scale_Q14                             
/* I    LTP state scaling               */
>  );
>
> +#if defined OPUS_X86_PRESUME_SSE4_1
> +
> +#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12,
LTPCoef_Q14, AR2_Q13, \
> +                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
> +    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses,
PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
> +                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14))
> +
> +#else
> +
>  extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
>      const silk_encoder_state    *psEncC,                                  
/* I/O  Encoder State                   */
>      silk_nsq_state              *NSQ,                                     
/* I/O  NSQ state                       */
> @@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
>      ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices,
x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
>                     HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
pitchL, Lambda_Q10, LTP_scale_Q14))
>
> +#endif
> +
>  #  define OVERRIDE_silk_NSQ_del_dec
>
>  void silk_NSQ_del_dec_sse4_1(
> @@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1(
>      const opus_int              LTP_scale_Q14                             
/* I    LTP state scaling               */
>  );
>
> +#if defined OPUS_X86_PRESUME_SSE4_1
> +
> +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses,
PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
> +                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14,
Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
> +    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3,
pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
> +                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14,
Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
> +
> +#else
> +
>  extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
>      const silk_encoder_state    *psEncC,                                  
/* I/O  Encoder State                   */
>      silk_nsq_state              *NSQ,                                     
/* I/O  NSQ state                       */
> @@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK
+ 1])(
>      ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ,
psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
>                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14,
Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
>
> +#endif
> +
>  void silk_noise_shape_quantizer(
>      silk_nsq_state      *NSQ,                   /* I/O  NSQ state         
*/
>      opus_int            signalType,             /* I    Signal type       
*/
> @@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(
>      const opus_int16   pIn[]
>  );
>
> +#if defined(OPUS_X86_PRESUME_SSE4_1)
> +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch)
((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
> +
> +#else
> +
>  #  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
>       ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
>
> @@ -201,6 +239,8 @@ extern opus_int (*const
SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
>
>  #  define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
>
> +#endif
> +
>  void silk_warped_LPC_analysis_filter_FIX_sse4_1(
>            opus_int32            state[],                    /* I/O  State
[order + 1]                   */
>            opus_int32            res_Q2[],                   /* O   
Residual signal [length]            */
> @@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1(
>      const opus_int              order                       /* I    Filter
order (even)                 */
>  );
>
> +#if defined(OPUS_X86_PRESUME_SSE4_1)
> +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13,
input, lambda_Q16, length, order, arch) \
> +    ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2,
coef_Q13, input, lambda_Q16, length, order))
> +
> +#else
> +
>  extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK
+ 1])(
>            opus_int32            state[],                    /* I/O  State
[order + 1]                   */
>            opus_int32            res_Q2[],                   /* O   
Residual signal [length]            */
> @@ -224,5 +270,7 @@ extern void (*const
SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])
>  #  define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13,
input, lambda_Q16, length, order, arch) \
>      ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) &
OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
>
> +#endif
> +
>  # endif
>  #endif
> diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
> index 6747d10..ad9fef2 100644
> --- a/silk/x86/x86_silk_map.c
> +++ b/silk/x86/x86_silk_map.c
> @@ -35,6 +35,10 @@
>  #include "pitch.h"
>  #include "main.h"
>
> +#if !defined(OPUS_X86_PRESUME_SSE4_1)
> +
> +#if defined(FIXED_POINT)
> +
>  opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ]
)(
>      const opus_int16 *inVec1,
>      const opus_int16 *inVec2,
> @@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[
OPUS_ARCHMASK + 1 ] )(
>  ) = {
>    silk_inner_prod16_aligned_64_c,                  /* non-sse */
>    silk_inner_prod16_aligned_64_c,
> +  silk_inner_prod16_aligned_64_c,
>    MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
> -  NULL
>  };
>
> +#endif
> +
>  opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
>      silk_encoder_state *psEncC,
>      const opus_int16   pIn[]
>  ) = {
>    silk_VAD_GetSA_Q8_c,                  /* non-sse */
>    silk_VAD_GetSA_Q8_c,
> +  silk_VAD_GetSA_Q8_c,
>    MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
> -  NULL
>  };
>
>  void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
> @@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
>  ) = {
>    silk_NSQ_c,                  /* non-sse */
>    silk_NSQ_c,
> +  silk_NSQ_c,
>    MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
> -  NULL
>  };
>
>  void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
> @@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ]
)(
>  ) = {
>    silk_VQ_WMat_EC_c,                  /* non-sse */
>    silk_VQ_WMat_EC_c,
> +  silk_VQ_WMat_EC_c,
>    MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
> -  NULL
>  };
>
>  void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
> @@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1
] )(
>  ) = {
>    silk_NSQ_del_dec_c,                  /* non-sse */
>    silk_NSQ_del_dec_c,
> +  silk_NSQ_del_dec_c,
>    MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
> -  NULL
>  };
>
> +#if defined(FIXED_POINT)
> +
>  void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ]
)(
>      opus_int32                  state[],                    /* I/O  State
[order + 1]                   */
>      opus_int32                  res_Q2[],                   /* O   
Residual signal [length]            */
> @@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[
OPUS_ARCHMASK + 1 ] )(
>  ) = {
>    silk_warped_LPC_analysis_filter_FIX_c,                  /* non-sse */
>    silk_warped_LPC_analysis_filter_FIX_c,
> +  silk_warped_LPC_analysis_filter_FIX_c,
>    MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
> -  NULL
>  };
>
>  void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
> @@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1
] )(
>  ) = {
>    silk_burg_modified_c,                  /* non-sse */
>    silk_burg_modified_c,
> +  silk_burg_modified_c,
>    MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
> -  NULL
>  };
> +
> +#endif
> +#endif
> diff --git a/win32/VS2010/celt.vcxproj b/win32/VS2010/celt.vcxproj
> index f107fec..e068fbe 100644
> --- a/win32/VS2010/celt.vcxproj
> +++ b/win32/VS2010/celt.vcxproj
> @@ -37,6 +37,12 @@
>      <ClCompile Include="..\..\celt\quant_bands.c" />
>      <ClCompile Include="..\..\celt\rate.c" />
>      <ClCompile Include="..\..\celt\vq.c" />
> +    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c" />
> +    <ClCompile Include="..\..\celt\x86\pitch_sse.c" />
> +    <ClCompile Include="..\..\celt\x86\pitch_sse2.c" />
> +    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c" />
> +    <ClCompile Include="..\..\celt\x86\x86cpu.c" />
> +    <ClCompile Include="..\..\celt\x86\x86_celt_map.c" />
>    </ItemGroup>
>    <ItemGroup>
>      <ClInclude Include="..\..\celt\arch.h" />
> @@ -67,6 +73,9 @@
>      <ClInclude Include="..\..\celt\static_modes_fixed.h"
/>
>      <ClInclude Include="..\..\celt\static_modes_float.h"
/>
>      <ClInclude Include="..\..\celt\vq.h" />
> +    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h" />
> +    <ClInclude Include="..\..\celt\x86\pitch_sse.h" />
> +    <ClInclude Include="..\..\celt\x86\x86cpu.h" />
>      <ClInclude Include="..\..\celt\_kiss_fft_guts.h" />
>    </ItemGroup>
>    <PropertyGroup Label="Globals">
> @@ -141,7 +150,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -168,7 +177,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -196,7 +205,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -227,7 +236,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)\..\;$(ProjectDir)\..\..\include;$(ProjectDir)\..\..\celt;$(ProjectDir)\..\..\silk;$(ProjectDir)\..\..\silk\float;$(ProjectDir)\..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> diff --git a/win32/VS2010/celt.vcxproj.filters
b/win32/VS2010/celt.vcxproj.filters
> index e3a1d97..e9948fa 100644
> --- a/win32/VS2010/celt.vcxproj.filters
> +++ b/win32/VS2010/celt.vcxproj.filters
> @@ -69,6 +69,24 @@
>      <ClCompile Include="..\..\celt\celt.c">
>        <Filter>Source Files</Filter>
>      </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\pitch_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\pitch_sse2.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\x86_celt_map.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\celt\x86\x86cpu.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
>    </ItemGroup>
>    <ItemGroup>
>      <ClInclude Include="..\..\celt\cwrs.h">
> @@ -158,5 +176,14 @@
>      <ClInclude Include="..\..\celt\celt_lpc.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> +    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h">
> +      <Filter>Header Files</Filter>
> +    </ClInclude>
> +    <ClInclude Include="..\..\celt\x86\pitch_sse.h">
> +      <Filter>Header Files</Filter>
> +    </ClInclude>
> +    <ClInclude Include="..\..\celt\x86\x86cpu.h">
> +      <Filter>Header Files</Filter>
> +    </ClInclude>
>    </ItemGroup>
>  </Project>
> \ No newline at end of file
> diff --git a/win32/VS2010/silk_common.vcxproj
b/win32/VS2010/silk_common.vcxproj
> index 9cf5f48..d3d077d 100644
> --- a/win32/VS2010/silk_common.vcxproj
> +++ b/win32/VS2010/silk_common.vcxproj
> @@ -88,7 +88,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -118,7 +118,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -149,7 +149,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>        <FloatingPointModel>Fast</FloatingPointModel>
>      </ClCompile>
> @@ -184,7 +184,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk/float;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>        <FloatingPointModel>Fast</FloatingPointModel>
>      </ClCompile>
> @@ -212,6 +212,8 @@
>    </ItemDefinitionGroup>
>    <ItemGroup>
>      <ClInclude Include="..\..\include\opus_types.h" />
> +    <ClInclude Include="..\..\silk\x86\main_sse.h" />
> +    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h"
/>
>      <ClInclude Include="..\..\win32\config.h" />
>      <ClInclude Include="..\..\silk\control.h" />
>      <ClInclude Include="..\..\silk\debug.h" />
> @@ -311,8 +313,13 @@
>      <ClCompile Include="..\..\silk\table_LSF_cos.c" />
>      <ClCompile Include="..\..\silk\VAD.c" />
>      <ClCompile Include="..\..\silk\VQ_WMat_EC.c" />
> +    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c"
/>
> +    <ClCompile Include="..\..\silk\x86\NSQ_sse.c" />
> +    <ClCompile Include="..\..\silk\x86\VAD_sse.c" />
> +    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c"
/>
> +    <ClCompile Include="..\..\silk\x86\x86_silk_map.c" />
>    </ItemGroup>
>    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets"
/>
>    <ImportGroup Label="ExtensionTargets">
>    </ImportGroup>
> -</Project>
> +</Project>
> \ No newline at end of file
> diff --git a/win32/VS2010/silk_common.vcxproj.filters
b/win32/VS2010/silk_common.vcxproj.filters
> index 30db48e..341180b 100644
> --- a/win32/VS2010/silk_common.vcxproj.filters
> +++ b/win32/VS2010/silk_common.vcxproj.filters
> @@ -81,6 +81,12 @@
>      <ClInclude Include="..\..\silk\typedef.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> +    <ClInclude Include="..\..\silk\x86\main_sse.h">
> +      <Filter>Header Files</Filter>
> +    </ClInclude>
> +    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h">
> +      <Filter>Header Files</Filter>
> +    </ClInclude>
>    </ItemGroup>
>    <ItemGroup>
>      <ClCompile Include="..\..\silk\VQ_WMat_EC.c">
> @@ -311,5 +317,20 @@
>      <ClCompile Include="..\..\silk\VAD.c">
>        <Filter>Source Files</Filter>
>      </ClCompile>
> +    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\silk\x86\NSQ_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\silk\x86\VAD_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile Include="..\..\silk\x86\x86_silk_map.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
>    </ItemGroup>
> -</Project>
> +</Project>
> \ No newline at end of file
> diff --git a/win32/VS2010/silk_fixed.vcxproj
b/win32/VS2010/silk_fixed.vcxproj
> index 5ea1a91..522101e 100644
> --- a/win32/VS2010/silk_fixed.vcxproj
> +++ b/win32/VS2010/silk_fixed.vcxproj
> @@ -86,7 +86,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -104,7 +104,7 @@
>        <WarningLevel>Level3</WarningLevel>
>        <Optimization>Disabled</Optimization>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -123,7 +123,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -145,7 +145,7 @@
>        <FunctionLevelLinking>true</FunctionLevelLinking>
>        <IntrinsicFunctions>true</IntrinsicFunctions>
>       
<PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
> -     
<AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
> +     
<AdditionalIncludeDirectories>$(ProjectDir)/../..;$(ProjectDir)/../../silk/fixed;$(ProjectDir)/../../silk;$(ProjectDir)/../../win32;$(ProjectDir)/../../celt;$(ProjectDir)/../../include;$(ProjectDir)/../win32</AdditionalIncludeDirectories>
>        <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
>      </ClCompile>
>      <Link>
> @@ -191,8 +191,11 @@
>      <ClCompile Include="..\..\silk\fixed\solve_LS_FIX.c"
/>
>      <ClCompile Include="..\..\silk\fixed\vector_ops_FIX.c"
/>
>      <ClCompile
Include="..\..\silk\fixed\warped_autocorrelation_FIX.c" />
> +    <ClCompile
Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c" />
> +    <ClCompile
Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c" />
> +    <ClCompile
Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c" />
>    </ItemGroup>
>    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets"
/>
>    <ImportGroup Label="ExtensionTargets">
>    </ImportGroup>
> -</Project>
> +</Project>
> \ No newline at end of file
> diff --git a/win32/VS2010/silk_fixed.vcxproj.filters
b/win32/VS2010/silk_fixed.vcxproj.filters
> index 6897930..c2327eb 100644
> --- a/win32/VS2010/silk_fixed.vcxproj.filters
> +++ b/win32/VS2010/silk_fixed.vcxproj.filters
> @@ -18,16 +18,16 @@
>      <ClInclude Include="..\..\win32\config.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> -    <ClInclude Include="main_FIX.h">
> +    <ClInclude Include="..\..\include\opus_types.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> -    <ClInclude Include="..\SigProc_FIX.h">
> +    <ClInclude Include="..\..\silk\SigProc_FIX.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> -    <ClInclude Include="structs_FIX.h">
> +    <ClInclude Include="..\..\silk\fixed\main_FIX.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
> -    <ClInclude Include="..\..\include\opus_types.h">
> +    <ClInclude Include="..\..\silk\fixed\structs_FIX.h">
>        <Filter>Header Files</Filter>
>      </ClInclude>
>    </ItemGroup>
> @@ -107,5 +107,14 @@
>      <ClCompile
Include="..\..\silk\fixed\LTP_analysis_filter_FIX.c">
>        <Filter>Source Files</Filter>
>      </ClCompile>
> +    <ClCompile
Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile
Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
> +    <ClCompile
Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c">
> +      <Filter>Source Files</Filter>
> +    </ClCompile>
>    </ItemGroup>
>  </Project>
> \ No newline at end of file
> diff --git a/win32/config.h b/win32/config.h
> index 46ff699..10fbf33 100644
> --- a/win32/config.h
> +++ b/win32/config.h
> @@ -35,9 +35,28 @@ POSSIBILITY OF SUCH DAMAGE.
>
>  #define OPUS_BUILD            1
>
> -/* Enable SSE functions, if compiled with SSE/SSE2 (note that AMD64
implies SSE2) */
> -#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >=
1))
> -#define __SSE__               1
> +#if defined(_M_IX86) || defined(_M_X64)
> +/* Can always build with SSE intrinsics (no special compiler flags
necessary) */
> +#define OPUS_X86_MAY_HAVE_SSE
> +#define OPUS_X86_MAY_HAVE_SSE2
> +#define OPUS_X86_MAY_HAVE_SSE4_1
> +
> +/* Presume SSE functions, if compiled with SSE/SSE2/AVX (note that AMD64
implies SSE2, and AVX
> +   implies SSE4.1) */
> +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >=
1)) || defined(__AVX__)
> +#define OPUS_X86_PRESUME_SSE 1
> +#endif
> +#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >=
2)) || defined(__AVX__)
> +#define OPUS_X86_PRESUME_SSE2 1
> +#endif
> +#if defined(__AVX__)
> +#define OPUS_X86_PRESUME_SSE4_1 1
> +#endif
> +
> +#if !defined(OPUS_X86_PRESUME_SSE4_1) || !defined(OPUS_X86_PRESUME_SSE2)
|| !defined(OPUS_X86_PRESUME_SSE)
> +#define OPUS_HAVE_RTCD 1
> +#endif
> +
>  #endif
>
>  #include "version.h"
> --
> 1.9.1
>

Possibly Parallel Threads

Search for more apparently analagous threads

opus - Mar 2015 - [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.

[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.

[opus] [RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.

Possibly Parallel Threads