media/libspeex_resampler/sse-detect-runtime.patch
author Karl Tomlinson <karlt+@karlt.net>
Wed, 23 Jul 2014 16:56:25 +1200
changeset 195798 f8cd15fa98249662c4541a1fdf588a3cace2b67a
parent 142395 65485a61c3a9faeb70ee97e60bc370cff9bb68a9
child 195799 0ca7ce33b2b2e7de72da140bbd748ee157574cb2
permissions -rw-r--r--
b=1042504 switch speex resampler to speexdsp bbe7e099 r=padenot This is the speexdsp revision most similar to what is currently in Gecko. We'll want to keep the HUGEMEM variant that we currently have from opus-tools, but that will be restored in a subsequent patch.

--- a/src/resample.c	2014-07-01 17:25:53.999320032 +1200
+++ b/src/resample.c	2014-07-01 17:42:18.822611775 +1200
@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
                
 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
 
 #ifndef NULL
 #define NULL 0
 #endif
 
+#include "sse_detect.h"
+
+/* We compile SSE code on x86 all the time, but we only use it if we find at
+ * runtime that the CPU supports it. */
 #ifdef _USE_SSE
+#ifdef _MSC_VER
+#define inline __inline
+#endif
 #include "resample_sse.h"
+#ifdef _MSC_VER
+#undef inline
+#endif
 #endif
 
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
 #define FIXED_STACK_ALLOC 8192
 #else
 #define FIXED_STACK_ALLOC 1024
 #endif
@@ -338,35 +348,39 @@ static int resampler_basic_direct_single
    const spx_uint32_t den_rate = st->den_rate;
    spx_word32_t sum;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    if (moz_has_sse()) {
+      sum = inner_product_single(sinct, iptr, N);
+    } else {
+#endif
       int j;
       sum = 0;
       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
 
 /*    This code is slower on most DSPs which have only 2 accumulators.
       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
       I think we can trust the compiler and let it vectorize and/or unroll itself.
       spx_word32_t accum[4] = {0,0,0,0};
       for(j=0;j<N;j+=4) {
         accum[0] += MULT16_16(sinct[j], iptr[j]);
         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
       }
       sum = accum[0] + accum[1] + accum[2] + accum[3];
 */
-#else
-      sum = inner_product_single(sinct, iptr, N);
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    }
 #endif
 
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -395,29 +409,33 @@ static int resampler_basic_direct_double
    const spx_uint32_t den_rate = st->den_rate;
    double sum;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
-      int j;
-      double accum[4] = {0,0,0,0};
-
-      for(j=0;j<N;j+=4) {
-        accum[0] += sinct[j]*iptr[j];
-        accum[1] += sinct[j+1]*iptr[j+1];
-        accum[2] += sinct[j+2]*iptr[j+2];
-        accum[3] += sinct[j+3]*iptr[j+3];
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+      if(moz_has_sse2()) {
+        sum = inner_product_double(sinct, iptr, N);
+      } else {
+#endif
+        int j;
+        double accum[4] = {0,0,0,0};
+
+        for(j=0;j<N;j+=4) {
+          accum[0] += sinct[j]*iptr[j];
+          accum[1] += sinct[j+1]*iptr[j+1];
+          accum[2] += sinct[j+2]*iptr[j+2];
+          accum[3] += sinct[j+3]*iptr[j+3];
+        }
+        sum = accum[0] + accum[1] + accum[2] + accum[3];
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
       }
-      sum = accum[0] + accum[1] + accum[2] + accum[3];
-#else
-      sum = inner_product_double(sinct, iptr, N);
 #endif
 
       out[out_stride * out_sample++] = PSHR32(sum, 15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
@@ -451,35 +469,38 @@ static int resampler_basic_interpolate_s
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-      int j;
-      spx_word32_t accum[4] = {0,0,0,0};
-
-      for(j=0;j<N;j++) {
-        const spx_word16_t curr_in=iptr[j];
-        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
-        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
-        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
-        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      if (moz_has_sse()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
+        int j;
+        spx_word32_t accum[4] = {0,0,0,0};
+
+        for(j=0;j<N;j++) {
+          const spx_word16_t curr_in=iptr[j];
+          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+        }
+        cubic_coef(frac, interp);
+        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
       }
-
-      cubic_coef(frac, interp);
-      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
 #endif
-      
+
       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
@@ -513,35 +534,38 @@ static int resampler_basic_interpolate_d
 #ifdef FIXED_POINT
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      if (moz_has_sse2()) {
+        cubic_coef(frac, interp);
+        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+      } else {
+#endif
       int j;
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const double curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
 
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
-#else
-      cubic_coef(frac, interp);
-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+      }
 #endif
-      
       out[out_stride * out_sample++] = PSHR32(sum,15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }