Bug 894941 - Patch the speex resampler to do runtime checks for SSE. r=ehsan,glandium
author: Paul Adenot <paul@paul.cx>
date: Fri, 26 Jul 2013 18:46:32 +0200
changeset: 140186 adeb3608cd4114359cc0eff753f704ed578deb11
parent: 140185 f134b528c97d9f10ae10b0f2702f4cb66c95cec4
child: 140187 2aa33cb78665591b5244c36b111b67507af0e293
push id: 1945
push user: ryanvm@gmail.com
push date: Sat, 27 Jul 2013 02:27:26 +0000
treeherder: fx-team@4874fa438b1c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: ehsan, glandium
bugs: 894941
milestone: 25.0a1
Bug 894941 - Patch the speex resampler to do runtime checks for SSE. r=ehsan,glandium
media/libspeex_resampler/src/Makefile.in
media/libspeex_resampler/src/moz.build
media/libspeex_resampler/src/resample.c
media/libspeex_resampler/src/sse_detect.cpp
media/libspeex_resampler/src/sse_detect.h
media/libspeex_resampler/sse-detect-runtime.patch
media/libspeex_resampler/update.sh
--- a/media/libspeex_resampler/src/Makefile.in
+++ b/media/libspeex_resampler/src/Makefile.in
@@ -26,8 +26,18 @@ else
 DEFINES += -DFLOATING_POINT
 endif
 
 CSRCS = \
     resample.c \
     $(NULL)
 
 include $(topsrcdir)/config/rules.mk
+
+# Only use SSE code when using floating point samples, and on x86. NOTE(review): -msse2 applies to all of resample.c, scalar fallbacks included -- confirm that is intended.
+ifneq (,$(INTEL_ARCHITECTURE))
+ifneq ($(OS_TARGET),Android)
+DEFINES += -D_USE_SSE -D_USE_SSE2
+ifdef GNU_CC
+resample.$(OBJ_SUFFIX): CFLAGS+=-msse2
+endif
+endif
+endif
--- a/media/libspeex_resampler/src/moz.build
+++ b/media/libspeex_resampler/src/moz.build
@@ -7,10 +7,14 @@
 MODULE = 'speex_resampler'
 
 EXPORTS.speex += [
     'speex_config_types.h',
     'speex_resampler.h',
     'speex_types.h',
 ]
 
+CPP_SOURCES += [
+  'sse_detect.cpp',
+]
+
 LIBRARY_NAME = 'speex_resampler'
 
--- a/media/libspeex_resampler/src/resample.c
+++ b/media/libspeex_resampler/src/resample.c
@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
                
 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
 
 #ifndef NULL
 #define NULL 0
 #endif
 
+#include "sse_detect.h"
+
+/* We compile SSE code on x86 all the time, but we only use it if we find at
+ * runtime that the CPU supports it. */
 #ifdef _USE_SSE
+#ifdef _MSC_VER
+#define inline __inline
+#endif
 #include "resample_sse.h"
+#ifdef _MSC_VER
+#undef inline
+#endif
 #endif
 
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
 #define FIXED_STACK_ALLOC 8192
 #else
 #define FIXED_STACK_ALLOC 1024
 #endif
@@ -339,34 +349,37 @@ static int resampler_basic_direct_single
    spx_word32_t sum;
    int j;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    if (moz_has_sse()) {
+      sum = inner_product_single(sinc, iptr, N);
+    } else {
+#endif
       sum = 0;
       for(j=0;j<N;j++) sum += MULT16_16(sinc[j], iptr[j]);
-
 /*    This code is slower on most DSPs which have only 2 accumulators.
       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
       I think we can trust the compiler and let it vectorize and/or unroll itself.
       spx_word32_t accum[4] = {0,0,0,0};
       for(j=0;j<N;j+=4) {
         accum[0] += MULT16_16(sinc[j], iptr[j]);
         accum[1] += MULT16_16(sinc[j+1], iptr[j+1]);
         accum[2] += MULT16_16(sinc[j+2], iptr[j+2]);
         accum[3] += MULT16_16(sinc[j+3], iptr[j+3]);
       }
       sum = accum[0] + accum[1] + accum[2] + accum[3];
 */
-#else
-      sum = inner_product_single(sinc, iptr, N);
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    }
 #endif
 
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -396,28 +409,32 @@ static int resampler_basic_direct_double
    double sum;
    int j;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
-      double accum[4] = {0,0,0,0};
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+      if(moz_has_sse2()) {
+        sum = inner_product_double(sinc, iptr, N);
+      } else {
+#endif
+        double accum[4] = {0,0,0,0};
 
-      for(j=0;j<N;j+=4) {
-        accum[0] += sinc[j]*iptr[j];
-        accum[1] += sinc[j+1]*iptr[j+1];
-        accum[2] += sinc[j+2]*iptr[j+2];
-        accum[3] += sinc[j+3]*iptr[j+3];
+        for(j=0;j<N;j+=4) {
+          accum[0] += sinc[j]*iptr[j];
+          accum[1] += sinc[j+1]*iptr[j+1];
+          accum[2] += sinc[j+2]*iptr[j+2];
+          accum[3] += sinc[j+3]*iptr[j+3];
+        }
+        sum = accum[0] + accum[1] + accum[2] + accum[3];
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
       }
-      sum = accum[0] + accum[1] + accum[2] + accum[3];
-#else
-      sum = inner_product_double(sinc, iptr, N);
 #endif
 
       out[out_stride * out_sample++] = PSHR32(sum, 15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -452,34 +469,37 @@ static int resampler_basic_interpolate_s
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      if (moz_has_sse()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
+
       spx_word32_t accum[4] = {0,0,0,0};
-
       for(j=0;j<N;j++) {
         const spx_word16_t curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
-
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      }
 #endif
-      
+
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
@@ -514,32 +534,36 @@ static int resampler_basic_interpolate_d
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      if (moz_has_sse2()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const double curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
 
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      } /* end of scalar fallback (no SSE2) */
 #endif
       
       out[out_stride * out_sample++] = PSHR32(sum,15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/src/sse_detect.cpp
@@ -0,0 +1,23 @@
+/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/SSE.h"
+#include "sse_detect.h"
+
+// Wrapper declared in sse_detect.h: 1 if mozilla::supports_sse2() is true, else 0.
+int moz_has_sse2() {
+  if (mozilla::supports_sse2()) {
+    return 1;
+  }
+  return 0;
+}
+
+// Wrapper declared in sse_detect.h: 1 if mozilla::supports_sse() is true, else 0.
+int moz_has_sse() {
+  if (mozilla::supports_sse()) {
+    return 1;
+  }
+  return 0;
+}
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/src/sse_detect.h
@@ -0,0 +1,23 @@
+/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef SSE_DETECT
+#define SSE_DETECT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* Runtime CPU feature checks, declared with C linkage so that C code can
+   * call them. Each returns 1 if the corresponding instruction set is
+   * reported as supported, and 0 otherwise. */
+  int moz_has_sse2(void);
+  int moz_has_sse(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SSE_DETECT */
new file mode 100644
--- /dev/null
+++ b/media/libspeex_resampler/sse-detect-runtime.patch
@@ -0,0 +1,143 @@
+diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
+--- a/src/resample.c
++++ b/src/resample.c
+@@ -95,8 +95,18 @@ static void speex_free (void *ptr) {free
+ #define NULL 0
+ #endif
+ 
++#include "sse_detect.h"
++
++/* We compile SSE code on x86 all the time, but we only use it if we find at
++ * runtime that the CPU supports it. */
+ #ifdef _USE_SSE
++#ifdef _MSC_VER
++#define inline __inline
++#endif
+ #include "resample_sse.h"
++#ifdef _MSC_VER
++#undef inline
++#endif
+ #endif
+ 
+ /* Numer of elements to allocate on the stack */
+@@ -344,10 +354,13 @@ static int resampler_basic_direct_single
+       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
+       const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++    if (moz_has_sse()) {
++      sum = inner_product_single(sinc, iptr, N);
++    } else {
++#endif
+       sum = 0;
+       for(j=0;j<N;j++) sum += MULT16_16(sinc[j], iptr[j]);
+-
+ /*    This code is slower on most DSPs which have only 2 accumulators.
+       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
+       I think we can trust the compiler and let it vectorize and/or unroll itself.
+@@ -360,8 +373,8 @@ static int resampler_basic_direct_single
+       }
+       sum = accum[0] + accum[1] + accum[2] + accum[3];
+ */
+-#else
+-      sum = inner_product_single(sinc, iptr, N);
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++    }
+ #endif
+ 
+       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
+@@ -401,18 +414,22 @@ static int resampler_basic_direct_double
+       const spx_word16_t *sinc = & sinc_table[samp_frac_num*N];
+       const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
+-      double accum[4] = {0,0,0,0};
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
++      if(moz_has_sse2()) {
++        sum = inner_product_double(sinc, iptr, N);
++      } else {
++#endif
++        double accum[4] = {0,0,0,0};
+ 
+-      for(j=0;j<N;j+=4) {
+-        accum[0] += sinc[j]*iptr[j];
+-        accum[1] += sinc[j+1]*iptr[j+1];
+-        accum[2] += sinc[j+2]*iptr[j+2];
+-        accum[3] += sinc[j+3]*iptr[j+3];
++        for(j=0;j<N;j+=4) {
++          accum[0] += sinc[j]*iptr[j];
++          accum[1] += sinc[j+1]*iptr[j+1];
++          accum[2] += sinc[j+2]*iptr[j+2];
++          accum[3] += sinc[j+3]*iptr[j+3];
++        }
++        sum = accum[0] + accum[1] + accum[2] + accum[3];
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+       }
+-      sum = accum[0] + accum[1] + accum[2] + accum[3];
+-#else
+-      sum = inner_product_double(sinc, iptr, N);
+ #endif
+ 
+       out[out_stride * out_sample++] = PSHR32(sum, 15);
+@@ -457,9 +474,14 @@ static int resampler_basic_interpolate_s
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      if (moz_has_sse()) {
++        cubic_coef(frac, interp);
++        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      } else {
++#endif
++
+       spx_word32_t accum[4] = {0,0,0,0};
+-
+       for(j=0;j<N;j++) {
+         const spx_word16_t curr_in=iptr[j];
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+@@ -467,14 +489,12 @@ static int resampler_basic_interpolate_s
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+       }
+-
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+-#else
+-      cubic_coef(frac, interp);
+-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      }
+ #endif
+-      
++
+       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
+       last_sample += int_advance;
+       samp_frac_num += frac_advance;
+@@ -519,7 +539,12 @@ static int resampler_basic_interpolate_d
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      if (moz_has_sse2()) {
++        cubic_coef(frac, interp);
++        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      } else {
++#endif
+       double accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j++) {
+@@ -532,9 +557,8 @@ static int resampler_basic_interpolate_d
+ 
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
+-#else
+-      cubic_coef(frac, interp);
+-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      } /* end of scalar fallback (no SSE2) */
+ #endif
+       
+       out[out_stride * out_sample++] = PSHR32(sum,15);
--- a/media/libspeex_resampler/update.sh
+++ b/media/libspeex_resampler/update.sh
@@ -15,8 +15,9 @@ cp $1/libspeex/fixed_generic.h src
 cp $1/include/speex/speex_resampler.h src
 cp $1/include/speex/speex_types.h src
 sed -e 's/unsigned @SIZE16@/uint16_t/g' -e 's/unsigned @SIZE32@/uint32_t/g' -e 's/@SIZE16@/int16_t/g' -e 's/@SIZE32@/int32_t/g' < $1/include/speex/speex_config_types.h.in > src/speex_config_types.h
 cp $1/AUTHORS .
 cp $1/COPYING .
 
 # apply outstanding local patches
 patch -p1 < truncation.patch
+patch -p1 < sse-detect-runtime.patch