Bug 894941 - Patch the speex resampler to do runtime checks for SSE. r=ehsan,glandium
author Paul Adenot <paul@paul.cx>
Fri, 26 Jul 2013 18:46:32 +0200
changeset 140179 adeb3608cd4114359cc0eff753f704ed578deb11
parent 140178 f134b528c97d9f10ae10b0f2702f4cb66c95cec4
child 140180 2aa33cb78665591b5244c36b111b67507af0e293
push id 25016
push user ryanvm@gmail.com
push date Sat, 27 Jul 2013 02:25:56 +0000
treeherder mozilla-central@fb48c7d58b8b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers ehsan, glandium
bugs 894941
milestone 25.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 894941 - Patch the speex resampler to do runtime checks for SSE. r=ehsan,glandium
media/libspeex_resampler/src/Makefile.in
media/libspeex_resampler/src/moz.build
media/libspeex_resampler/src/resample.c
media/libspeex_resampler/src/sse_detect.cpp
media/libspeex_resampler/src/sse_detect.h
media/libspeex_resampler/sse-detect-runtime.patch
media/libspeex_resampler/update.sh
--- a/media/libspeex_resampler/src/Makefile.in
+++ b/media/libspeex_resampler/src/Makefile.in
@@ -26,8 +26,18 @@ else
 DEFINES += -DFLOATING_POINT
 endif
 
 CSRCS = \
     resample.c \
     $(NULL)
 
 include $(topsrcdir)/config/rules.mk
+
+# Only compile SSE code when using floating point samples, and on x86; it is only used if the CPU supports it at runtime
+ifneq (,$(INTEL_ARCHITECTURE))
+ifneq ($(OS_TARGET),Android)
+DEFINES += -D_USE_SSE -D_USE_SSE2
+ifdef GNU_CC
+resample.$(OBJ_SUFFIX): CFLAGS+=-msse2
+endif
+endif
+endif
--- a/media/libspeex_resampler/src/moz.build
+++ b/media/libspeex_resampler/src/moz.build
@@ -7,10 +7,14 @@
 MODULE = 'speex_resampler'
 
 EXPORTS.speex += [
     'speex_config_types.h',
     'speex_resampler.h',
     'speex_types.h',
 ]
 
+CPP_SOURCES += [
+  'sse_detect.cpp',
+]
+
 LIBRARY_NAME = 'speex_resampler'
 
--- a/media/libspeex_resampler/src/resample.c
+++ b/media/libspeex_resampler/src/resample.c
@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
                
 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
 
 #ifndef NULL
 #define NULL 0
 #endif
 
+#include "sse_detect.h"
+
+/* We compile SSE code on x86 all the time, but we only use it if we find at
+ * runtime that the CPU supports it. */
 #ifdef _USE_SSE
+#ifdef _MSC_VER
+#define inline __inline
+#endif
 #include "resample_sse.h"
+#ifdef _MSC_VER
+#undef inline
+#endif
 #endif
 
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
 #define FIXED_STACK_ALLOC 8192
 #else
 #define FIXED_STACK_ALLOC 1024
 #endif
@@ -339,34 +349,37 @@ static int resampler_basic_direct_single
    spx_word32_t sum;
    int j;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    if (moz_has_sse()) {
+      sum = inner_product_single(sinc, iptr, N);
+    } else {
+#endif
       sum = 0;
       for(j=0;j<N;j++) sum += MULT16_16(sinc[j], iptr[j]);
-
 /*    This code is slower on most DSPs which have only 2 accumulators.
       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
       I think we can trust the compiler and let it vectorize and/or unroll itself.
       spx_word32_t accum[4] = {0,0,0,0};
       for(j=0;j<N;j+=4) {
         accum[0] += MULT16_16(sinc[j], iptr[j]);
         accum[1] += MULT16_16(sinc[j+1], iptr[j+1]);
         accum[2] += MULT16_16(sinc[j+2], iptr[j+2]);
         accum[3] += MULT16_16(sinc[j+3], iptr[j+3]);
       }
       sum = accum[0] + accum[1] + accum[2] + accum[3];
 */
-#else
-      sum = inner_product_single(sinc, iptr, N);
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    }
 #endif
 
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -396,28 +409,32 @@ static int resampler_basic_direct_double
    double sum;
    int j;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
-      double accum[4] = {0,0,0,0};
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+      if(moz_has_sse2()) {
+        sum = inner_product_double(sinc, iptr, N);
+      } else {
+#endif
+        double accum[4] = {0,0,0,0};
 
-      for(j=0;j<N;j+=4) {
-        accum[0] += sinc[j]*iptr[j];
-        accum[1] += sinc[j+1]*iptr[j+1];
-        accum[2] += sinc[j+2]*iptr[j+2];
-        accum[3] += sinc[j+3]*iptr[j+3];
+        for(j=0;j<N;j+=4) {
+          accum[0] += sinc[j]*iptr[j];
+          accum[1] += sinc[j+1]*iptr[j+1];
+          accum[2] += sinc[j+2]*iptr[j+2];
+          accum[3] += sinc[j+3]*iptr[j+3];
+        }
+        sum = accum[0] + accum[1] + accum[2] + accum[3];
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
       }
-      sum = accum[0] + accum[1] + accum[2] + accum[3];
-#else
-      sum = inner_product_double(sinc, iptr, N);
 #endif
 
       out[out_stride * out_sample++] = PSHR32(sum, 15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -452,34 +469,37 @@ static int resampler_basic_interpolate_s
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      if (moz_has_sse()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
+
       spx_word32_t accum[4] = {0,0,0,0};
-
       for(j=0;j<N;j++) {
         const spx_word16_t curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
-
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      }
 #endif
-      
+
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
@@ -514,32 +534,36 @@ static int resampler_basic_interpolate_d
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      if (moz_has_sse2()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const double curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
 
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      }
 #endif
       
       out[out_stride * out_sample++] = PSHR32(sum,15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/src/sse_detect.cpp
@@ -0,0 +1,23 @@
+/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/SSE.h"
+#include "sse_detect.h"
+
+/* C-callable wrapper: 1 when mozilla::supports_sse2() reports true, else 0. */
+int moz_has_sse2() {
+  if (mozilla::supports_sse2()) {
+    return 1;
+  }
+  return 0;
+}
+
+/* C-callable wrapper: 1 when mozilla::supports_sse() reports true, else 0. */
+int moz_has_sse() {
+  if (mozilla::supports_sse()) {
+    return 1;
+  }
+  return 0;
+}
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/src/sse_detect.h
@@ -0,0 +1,22 @@
+/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef SSE_DETECT
+#define SSE_DETECT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* Runtime CPU feature probes with C linkage (implemented in
+   * sse_detect.cpp). Each returns 1 if the feature is supported, 0 otherwise. */
+  int moz_has_sse2(void);
+  int moz_has_sse(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SSE_DETECT */
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/sse-detect-runtime.patch
@@ -0,0 +1,143 @@
+diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
+--- a/src/resample.c
++++ b/src/resample.c
+@@ -95,8 +95,18 @@ static void speex_free (void *ptr) {free
+ #define NULL 0
+ #endif
+ 
++#include "sse_detect.h"
++
++/* We compile SSE code on x86 all the time, but we only use it if we find at
++ * runtime that the CPU supports it. */
+ #ifdef _USE_SSE
++#ifdef _MSC_VER
++#define inline __inline
++#endif
+ #include "resample_sse.h"
++#ifdef _MSC_VER
++#undef inline
++#endif
+ #endif
+ 
+ /* Numer of elements to allocate on the stack */
+@@ -344,10 +354,13 @@ static int resampler_basic_direct_single
+       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
+       const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++    if (moz_has_sse()) {
++      sum = inner_product_single(sinc, iptr, N);
++    } else {
++#endif
+       sum = 0;
+       for(j=0;j<N;j++) sum += MULT16_16(sinc[j], iptr[j]);
+-
+ /*    This code is slower on most DSPs which have only 2 accumulators.
+       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
+       I think we can trust the compiler and let it vectorize and/or unroll itself.
+@@ -360,8 +373,8 @@ static int resampler_basic_direct_single
+       }
+       sum = accum[0] + accum[1] + accum[2] + accum[3];
+ */
+-#else
+-      sum = inner_product_single(sinc, iptr, N);
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++    }
+ #endif
+ 
+       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
+@@ -401,18 +414,22 @@ static int resampler_basic_direct_double
+       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
+       const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
+-      double accum[4] = {0,0,0,0};
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
++      if(moz_has_sse2()) {
++        sum = inner_product_double(sinc, iptr, N);
++      } else {
++#endif
++        double accum[4] = {0,0,0,0};
+ 
+-      for(j=0;j<N;j+=4) {
+-        accum[0] += sinc[j]*iptr[j];
+-        accum[1] += sinc[j+1]*iptr[j+1];
+-        accum[2] += sinc[j+2]*iptr[j+2];
+-        accum[3] += sinc[j+3]*iptr[j+3];
++        for(j=0;j<N;j+=4) {
++          accum[0] += sinc[j]*iptr[j];
++          accum[1] += sinc[j+1]*iptr[j+1];
++          accum[2] += sinc[j+2]*iptr[j+2];
++          accum[3] += sinc[j+3]*iptr[j+3];
++        }
++        sum = accum[0] + accum[1] + accum[2] + accum[3];
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+       }
+-      sum = accum[0] + accum[1] + accum[2] + accum[3];
+-#else
+-      sum = inner_product_double(sinc, iptr, N);
+ #endif
+ 
+       out[out_stride * out_sample++] = PSHR32(sum, 15);
+@@ -457,9 +474,14 @@ static int resampler_basic_interpolate_s
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      if (moz_has_sse()) {
++        cubic_coef(frac, interp);
++        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      } else {
++#endif
++
+       spx_word32_t accum[4] = {0,0,0,0};
+-
+       for(j=0;j<N;j++) {
+         const spx_word16_t curr_in=iptr[j];
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+@@ -467,14 +489,12 @@ static int resampler_basic_interpolate_s
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+       }
+-
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+-#else
+-      cubic_coef(frac, interp);
+-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      }
+ #endif
+-      
++
+       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
+       last_sample += int_advance;
+       samp_frac_num += frac_advance;
+@@ -519,7 +539,12 @@ static int resampler_basic_interpolate_d
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      if (moz_has_sse2()) {
++        cubic_coef(frac, interp);
++        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      } else {
++#endif
+       double accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j++) {
+@@ -532,9 +557,8 @@ static int resampler_basic_interpolate_d
+ 
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
+-#else
+-      cubic_coef(frac, interp);
+-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      }
+ #endif
+       
+       out[out_stride * out_sample++] = PSHR32(sum,15);
--- a/media/libspeex_resampler/update.sh
+++ b/media/libspeex_resampler/update.sh
@@ -15,8 +15,9 @@ cp $1/libspeex/fixed_generic.h src
 cp $1/include/speex/speex_resampler.h src
 cp $1/include/speex/speex_types.h src
 sed -e 's/unsigned @SIZE16@/uint16_t/g' -e 's/unsigned @SIZE32@/uint32_t/g' -e 's/@SIZE16@/int16_t/g' -e 's/@SIZE32@/int32_t/g' < $1/include/speex/speex_config_types.h.in > src/speex_config_types.h
 cp $1/AUTHORS .
 cp $1/COPYING .
 
 # apply outstanding local patches
 patch -p1 < truncation.patch
+patch -p1 < sse-detect-runtime.patch