b=1042508 move resampler simd optimizations to separate translation units r=padenot
authorKarl Tomlinson <karlt+@karlt.net>
Wed, 23 Jul 2014 21:49:04 +1200
changeset 197635 82f2fe2768d56b6181ce89610558049a457ffca2
parent 197634 2566329e44b954a9ac17788bef9f0d85eb38e552
child 197636 a2404857df4479501d422be6895758ae2d96cf93
push id27249
push userryanvm@gmail.com
push dateMon, 04 Aug 2014 20:14:35 +0000
treeherdermozilla-central@7f81be7db528 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerspadenot
bugs1042508
milestone34.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
b=1042508 move resampler simd optimizations to separate translation units r=padenot This ensures that code in resample.c will run on Intel x86 cpus even when SSE support has been compiled, and will provide similarly for neon support when enabled.
media/libspeex_resampler/simd-detect-runtime.patch
media/libspeex_resampler/src/moz.build
media/libspeex_resampler/src/resample.c
media/libspeex_resampler/src/resample_neon.c
media/libspeex_resampler/src/resample_neon.h
media/libspeex_resampler/src/resample_sse.c
media/libspeex_resampler/src/resample_sse.h
media/libspeex_resampler/src/simd_detect.cpp
media/libspeex_resampler/src/simd_detect.h
media/libspeex_resampler/src/sse_detect.cpp
media/libspeex_resampler/src/sse_detect.h
media/libspeex_resampler/sse-detect-runtime.patch
media/libspeex_resampler/update.sh
rename from media/libspeex_resampler/sse-detect-runtime.patch
rename to media/libspeex_resampler/simd-detect-runtime.patch
--- a/media/libspeex_resampler/sse-detect-runtime.patch
+++ b/media/libspeex_resampler/simd-detect-runtime.patch
@@ -1,62 +1,58 @@
 diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
 --- a/media/libspeex_resampler/src/resample.c
 +++ b/media/libspeex_resampler/src/resample.c
-@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
+@@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
                 
  #define IMAX(a,b) ((a) > (b) ? (a) : (b))
  #define IMIN(a,b) ((a) < (b) ? (a) : (b))
  
  #ifndef NULL
  #define NULL 0
  #endif
  
-+#include "sse_detect.h"
-+
-+/* We compile SSE code on x86 all the time, but we only use it if we find at
-+ * runtime that the CPU supports it. */
- #ifdef _USE_SSE
-+#ifdef _MSC_VER
-+#define inline __inline
-+#endif
- #include "resample_sse.h"
-+#ifdef _MSC_VER
-+#undef inline
-+#endif
- #endif
- 
- #ifdef _USE_NEON
- #include "resample_neon.h"
- #endif
+-#ifdef _USE_SSE
+-#include "resample_sse.h"
+-#endif
+-
+-#ifdef _USE_NEON
+-#include "resample_neon.h"
+-#endif
++#include "simd_detect.h"
  
  /* Numer of elements to allocate on the stack */
  #ifdef VAR_ARRAYS
-@@ -342,17 +352,19 @@ static int resampler_basic_direct_single
+ #define FIXED_STACK_ALLOC 8192
+ #else
+ #define FIXED_STACK_ALLOC 1024
+ #endif
+ 
+@@ -344,17 +338,19 @@ static int resampler_basic_direct_single
     const spx_uint32_t den_rate = st->den_rate;
     spx_word32_t sum;
  
     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     {
        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
        const spx_word16_t *iptr = & in[last_sample];
  
 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-+      if (!moz_has_sse()) {
++      if (!moz_speex_have_single_simd()) {
 +#endif
        int j;
        sum = 0;
        for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
  
  /*    This code is slower on most DSPs which have only 2 accumulators.
        Plus this this forces truncation to 32 bits and you lose the HW guard bits.
        I think we can trust the compiler and let it vectorize and/or unroll itself.
        spx_word32_t accum[4] = {0,0,0,0};
-@@ -360,18 +372,20 @@ static int resampler_basic_direct_single
+@@ -362,18 +358,20 @@ static int resampler_basic_direct_single
          accum[0] += MULT16_16(sinct[j], iptr[j]);
          accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
          accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
          accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
        }
        sum = accum[0] + accum[1] + accum[2] + accum[3];
  */
        sum = SATURATE32PSHR(sum, 15, 32767);
@@ -68,28 +64,28 @@ diff --git a/media/libspeex_resampler/sr
  #endif
  
        out[out_stride * out_sample++] = sum;
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-@@ -400,29 +414,33 @@ static int resampler_basic_direct_double
+@@ -402,29 +400,33 @@ static int resampler_basic_direct_double
     const spx_uint32_t den_rate = st->den_rate;
     double sum;
  
     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     {
        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
        const spx_word16_t *iptr = & in[last_sample];
  
 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-+      if(moz_has_sse2()) {
++      if(moz_speex_have_double_simd()) {
 +#endif
        int j;
        double accum[4] = {0,0,0,0};
  
        for(j=0;j<N;j+=4) {
          accum[0] += sinct[j]*iptr[j];
          accum[1] += sinct[j+1]*iptr[j+1];
          accum[2] += sinct[j+2]*iptr[j+2];
@@ -104,28 +100,28 @@ diff --git a/media/libspeex_resampler/sr
  #endif
  
        out[out_stride * out_sample++] = PSHR32(sum, 15);
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-@@ -456,34 +474,38 @@ static int resampler_basic_interpolate_s
+@@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
  #ifdef FIXED_POINT
        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
  #else
        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
  #endif
        spx_word16_t interp[4];
  
  
 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-+      if (!moz_has_sse()) {
++      if (!moz_speex_have_single_simd()) {
 +#endif
        int j;
        spx_word32_t accum[4] = {0,0,0,0};
  
        for(j=0;j<N;j++) {
          const spx_word16_t curr_in=iptr[j];
          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
@@ -145,28 +141,28 @@ diff --git a/media/libspeex_resampler/sr
  #endif
        
        out[out_stride * out_sample++] = sum;
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-@@ -519,33 +541,37 @@ static int resampler_basic_interpolate_d
+@@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
  #ifdef FIXED_POINT
        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
  #else
        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
  #endif
        spx_word16_t interp[4];
  
  
 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-+      if (!moz_has_sse2()) {
++      if (!moz_speex_have_double_simd()) {
 +#endif
        int j;
        double accum[4] = {0,0,0,0};
  
        for(j=0;j<N;j++) {
          const double curr_in=iptr[j];
          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
@@ -185,8 +181,151 @@ diff --git a/media/libspeex_resampler/sr
  #endif
        
        out[out_stride * out_sample++] = PSHR32(sum,15);
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
+diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
+--- a/media/libspeex_resampler/src/resample_neon.c
++++ b/media/libspeex_resampler/src/resample_neon.c
+@@ -31,16 +31,18 @@
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
++#include "simd_detect.h"
++
+ #include <arm_neon.h>
+ 
+ #ifdef FIXED_POINT
+ #ifdef __thumb2__
+ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
+     int32_t ret;
+     asm ("ssat %[ret], #16, %[a]"
+          : [ret] "=&r" (ret)
+@@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
+     return ret;
+ }
+ #endif
+ #undef WORD2INT
+ #define WORD2INT(x) (saturate_32bit_to_16bit(x))
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+ /* Only works when len % 4 == 0 */
+-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
++int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+ {
+     int32_t ret;
+     uint32_t remainder = len % 16;
+     len = len - remainder;
+ 
+     asm volatile ("	 cmp %[len], #0\n"
+ 		  "	 bne 1f\n"
+ 		  "	 vld1.16 {d16}, [%[b]]!\n"
+@@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
+          : "q0");
+     return ret;
+ }
+ #undef WORD2INT
+ #define WORD2INT(x) (saturate_float_to_16bit(x))
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+ /* Only works when len % 4 == 0 */
+-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
++float inner_product_single(const float *a, const float *b, unsigned int len)
+ {
+     float ret;
+     uint32_t remainder = len % 16;
+     len = len - remainder;
+ 
+     asm volatile ("	 cmp %[len], #0\n"
+ 		  "	 bne 1f\n"
+ 		  "	 vld1.32 {q4}, [%[b]]!\n"
+diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
+--- a/media/libspeex_resampler/src/resample_sse.c
++++ b/media/libspeex_resampler/src/resample_sse.c
+@@ -29,37 +29,39 @@
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
++#include "simd_detect.h"
++
+ #include <xmmintrin.h>
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
++float inner_product_single(const float *a, const float *b, unsigned int len)
+ {
+    int i;
+    float ret;
+    __m128 sum = _mm_setzero_ps();
+    for (i=0;i<len;i+=8)
+    {
+       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+    }
+    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+    _mm_store_ss(&ret, sum);
+    return ret;
+ }
+ 
+ #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
++float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   float ret;
+   __m128 sum = _mm_setzero_ps();
+   __m128 f = _mm_loadu_ps(frac);
+   for(i=0;i<len;i+=2)
+   {
+     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+@@ -70,17 +72,17 @@ static inline float interpolate_product_
+    _mm_store_ss(&ret, sum);
+    return ret;
+ }
+ 
+ #ifdef _USE_SSE2
+ #include <emmintrin.h>
+ #define OVERRIDE_INNER_PRODUCT_DOUBLE
+ 
+-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
++double inner_product_double(const float *a, const float *b, unsigned int len)
+ {
+    int i;
+    double ret;
+    __m128d sum = _mm_setzero_pd();
+    __m128 t;
+    for (i=0;i<len;i+=8)
+    {
+       t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+@@ -92,17 +94,17 @@ static inline double inner_product_doubl
+       sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+    }
+    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+    _mm_store_sd(&ret, sum);
+    return ret;
+ }
+ 
+ #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
++double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   double ret;
+   __m128d sum;
+   __m128d sum1 = _mm_setzero_pd();
+   __m128d sum2 = _mm_setzero_pd();
+   __m128 f = _mm_loadu_ps(frac);
+   __m128d f1 = _mm_cvtps_pd(f);
+   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
--- a/media/libspeex_resampler/src/moz.build
+++ b/media/libspeex_resampler/src/moz.build
@@ -6,21 +6,18 @@
 
 LIBRARY_NAME = 'speex'
 
 EXPORTS.speex += [
     'speex_resampler.h',
 ]
 
 SOURCES += [
-  'sse_detect.cpp',
-]
-
-SOURCES += [
     'resample.c',
+    'simd_detect.cpp',
 ]
 
 MSVC_ENABLE_PGO = True
 
 FINAL_LIBRARY = 'gkmedias'
 
 # We don't compile the full speex codec, only the resampler.
 DEFINES['OUTSIDE_SPEEX'] = True
@@ -35,13 +32,16 @@ if CONFIG['MOZ_SAMPLE_TYPE_S16']:
     DEFINES['FIXED_POINT'] = True
 else:
     DEFINES['FLOATING_POINT'] = True
 
 # Only use SSE code when using floating point samples, and on x86
 if CONFIG['INTEL_ARCHITECTURE'] and not CONFIG['MOZ_SAMPLE_TYPE_S16']:
     DEFINES['_USE_SSE'] = True
     DEFINES['_USE_SSE2'] = True
-    SOURCES['resample.c'].flags += CONFIG['SSE2_FLAGS']
+    SOURCES += [
+        'resample_sse.c'
+    ]
+    SOURCES['resample_sse.c'].flags += CONFIG['SSE2_FLAGS']
 
 # Suppress warnings in third-party code.
 if CONFIG['GNU_CC']:
     CFLAGS += ['-Wno-sign-compare']
--- a/media/libspeex_resampler/src/resample.c
+++ b/media/libspeex_resampler/src/resample.c
@@ -92,33 +92,17 @@ static void speex_free (void *ptr) {free
                
 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
 
 #ifndef NULL
 #define NULL 0
 #endif
 
-#include "sse_detect.h"
-
-/* We compile SSE code on x86 all the time, but we only use it if we find at
- * runtime that the CPU supports it. */
-#ifdef _USE_SSE
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-#include "resample_sse.h"
-#ifdef _MSC_VER
-#undef inline
-#endif
-#endif
-
-#ifdef _USE_NEON
-#include "resample_neon.h"
-#endif
+#include "simd_detect.h"
 
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
 #define FIXED_STACK_ALLOC 8192
 #else
 #define FIXED_STACK_ALLOC 1024
 #endif
 
@@ -355,17 +339,17 @@ static int resampler_basic_direct_single
    spx_word32_t sum;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-      if (!moz_has_sse()) {
+      if (!moz_speex_have_single_simd()) {
 #endif
       int j;
       sum = 0;
       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
 
 /*    This code is slower on most DSPs which have only 2 accumulators.
       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
       I think we can trust the compiler and let it vectorize and/or unroll itself.
@@ -417,17 +401,17 @@ static int resampler_basic_direct_double
    double sum;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-      if(moz_has_sse2()) {
+      if(moz_speex_have_double_simd()) {
 #endif
       int j;
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j+=4) {
         accum[0] += sinct[j]*iptr[j];
         accum[1] += sinct[j+1]*iptr[j+1];
         accum[2] += sinct[j+2]*iptr[j+2];
@@ -477,17 +461,17 @@ static int resampler_basic_interpolate_s
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-      if (!moz_has_sse()) {
+      if (!moz_speex_have_single_simd()) {
 #endif
       int j;
       spx_word32_t accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const spx_word16_t curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
@@ -544,17 +528,17 @@ static int resampler_basic_interpolate_d
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-      if (!moz_has_sse2()) {
+      if (!moz_speex_have_double_simd()) {
 #endif
       int j;
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const double curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
rename from media/libspeex_resampler/src/resample_neon.h
rename to media/libspeex_resampler/src/resample_neon.c
--- a/media/libspeex_resampler/src/resample_neon.h
+++ b/media/libspeex_resampler/src/resample_neon.c
@@ -31,16 +31,18 @@
    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "simd_detect.h"
+
 #include <arm_neon.h>
 
 #ifdef FIXED_POINT
 #ifdef __thumb2__
 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
     int32_t ret;
     asm ("ssat %[ret], #16, %[a]"
          : [ret] "=&r" (ret)
@@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
     return ret;
 }
 #endif
 #undef WORD2INT
 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
 /* Only works when len % 4 == 0 */
-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 {
     int32_t ret;
     uint32_t remainder = len % 16;
     len = len - remainder;
 
     asm volatile ("	 cmp %[len], #0\n"
 		  "	 bne 1f\n"
 		  "	 vld1.16 {d16}, [%[b]]!\n"
@@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
          : "q0");
     return ret;
 }
 #undef WORD2INT
 #define WORD2INT(x) (saturate_float_to_16bit(x))
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
 /* Only works when len % 4 == 0 */
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+float inner_product_single(const float *a, const float *b, unsigned int len)
 {
     float ret;
     uint32_t remainder = len % 16;
     len = len - remainder;
 
     asm volatile ("	 cmp %[len], #0\n"
 		  "	 bne 1f\n"
 		  "	 vld1.32 {q4}, [%[b]]!\n"
rename from media/libspeex_resampler/src/resample_sse.h
rename to media/libspeex_resampler/src/resample_sse.c
--- a/media/libspeex_resampler/src/resample_sse.h
+++ b/media/libspeex_resampler/src/resample_sse.c
@@ -29,37 +29,39 @@
    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "simd_detect.h"
+
 #include <xmmintrin.h>
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+float inner_product_single(const float *a, const float *b, unsigned int len)
 {
    int i;
    float ret;
    __m128 sum = _mm_setzero_ps();
    for (i=0;i<len;i+=8)
    {
       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
    }
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
    _mm_store_ss(&ret, sum);
    return ret;
 }
 
 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
   int i;
   float ret;
   __m128 sum = _mm_setzero_ps();
   __m128 f = _mm_loadu_ps(frac);
   for(i=0;i<len;i+=2)
   {
     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
@@ -70,17 +72,17 @@ static inline float interpolate_product_
    _mm_store_ss(&ret, sum);
    return ret;
 }
 
 #ifdef _USE_SSE2
 #include <emmintrin.h>
 #define OVERRIDE_INNER_PRODUCT_DOUBLE
 
-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+double inner_product_double(const float *a, const float *b, unsigned int len)
 {
    int i;
    double ret;
    __m128d sum = _mm_setzero_pd();
    __m128 t;
    for (i=0;i<len;i+=8)
    {
       t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
@@ -92,17 +94,17 @@ static inline double inner_product_doubl
       sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
    }
    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
    _mm_store_sd(&ret, sum);
    return ret;
 }
 
 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
   int i;
   double ret;
   __m128d sum;
   __m128d sum1 = _mm_setzero_pd();
   __m128d sum2 = _mm_setzero_pd();
   __m128 f = _mm_loadu_ps(frac);
   __m128d f1 = _mm_cvtps_pd(f);
   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
rename from media/libspeex_resampler/src/sse_detect.cpp
rename to media/libspeex_resampler/src/simd_detect.cpp
--- a/media/libspeex_resampler/src/sse_detect.cpp
+++ b/media/libspeex_resampler/src/simd_detect.cpp
@@ -1,15 +1,27 @@
 /* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include "simd_detect.h"
+
 #include "mozilla/SSE.h"
-#include "sse_detect.h"
+#include "mozilla/arm.h"
 
-int moz_has_sse2() {
+#ifdef _USE_SSE2
+int moz_speex_have_double_simd() {
   return mozilla::supports_sse2() ? 1 : 0;
 }
+#endif
 
-int moz_has_sse() {
+#ifdef _USE_SSE
+int moz_speex_have_single_simd() {
   return mozilla::supports_sse() ? 1 : 0;
 }
+#endif
+
+#ifdef _USE_NEON
+int moz_speex_have_single_simd() {
+  return mozilla::supports_neon() ? 1 : 0;
+}
+#endif
rename from media/libspeex_resampler/src/sse_detect.h
rename to media/libspeex_resampler/src/simd_detect.h
--- a/media/libspeex_resampler/src/sse_detect.h
+++ b/media/libspeex_resampler/src/simd_detect.h
@@ -1,20 +1,43 @@
 /* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#ifndef SSE_DETECT
-#define SSE_DETECT
+#ifndef simd_detect_h
+#define simd_detect_h
+
+#include "speex_resampler.h"
+#include "arch.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-  int moz_has_sse2();
-  int moz_has_sse();
+int moz_speex_have_single_simd();
+int moz_speex_have_double_simd();
+
+#if defined(_USE_SSE) || defined(_USE_NEON)
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+#define inner_product_single CAT_PREFIX(RANDOM_PREFIX,_inner_product_single)
+spx_word32_t inner_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len);
+#endif
+#if defined(_USE_SSE)
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+#define interpolate_product_single CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_single)
+spx_word32_t interpolate_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len, const spx_uint32_t oversample, float *frac);
+#endif
+
+#if defined(_USE_SSE2)
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+#define inner_product_double CAT_PREFIX(RANDOM_PREFIX,_inner_product_double)
+double inner_product_double(const float *a, const float *b, unsigned int len);
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+#define interpolate_product_double CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_double)
+double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac);
+#endif
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif // SSE_DETECT
+#endif // simd_detect_h
--- a/media/libspeex_resampler/update.sh
+++ b/media/libspeex_resampler/update.sh
@@ -5,22 +5,22 @@
 # Usage: ./update.sh <speexdsp_directory>
 #
 # Copies the needed files from a directory containing the original
 # speexdsp sources.
 
 set -e -x
 
 cp $1/libspeexdsp/resample.c src
-cp $1/libspeexdsp/resample_sse.h src
-cp $1/libspeexdsp/resample_neon.h src
+cp $1/libspeexdsp/resample_sse.h src/resample_sse.c
+cp $1/libspeexdsp/resample_neon.h src/resample_neon.c
 cp $1/libspeexdsp/arch.h src
 cp $1/libspeexdsp/stack_alloc.h src
 cp $1/libspeexdsp/fixed_generic.h src
 cp $1/include/speex/speex_resampler.h src
 cp $1/AUTHORS .
 cp $1/COPYING .
 
 # apply outstanding local patches
 patch -p3 < outside-speex.patch
-patch -p3 < sse-detect-runtime.patch
+patch -p3 < simd-detect-runtime.patch
 patch -p3 < set-skip-frac.patch
 patch -p3 < hugemem.patch