b=1042504 update speex resampler to speexdsp 305e54ea r=padenot
author Karl Tomlinson <karlt+@karlt.net>
Wed, 23 Jul 2014 16:58:45 +1200
changeset 195799 0ca7ce33b2b2e7de72da140bbd748ee157574cb2
parent 195798 f8cd15fa98249662c4541a1fdf588a3cace2b67a
child 195800 ebb543336e85e32bc56a895d5518ebaa15d9d1c3
push id 27197
push user cbook@mozilla.com
push date Thu, 24 Jul 2014 13:25:44 +0000
treeherder mozilla-central@340ff53a5467 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers padenot
bugs 1042504
milestone 34.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
b=1042504 update speex resampler to speexdsp 305e54ea r=padenot
Upstream SATURATE32PSHR changes conflict with Gecko's SSE runtime detection patch. That patch is updated to reduce deviation from upstream.
media/libspeex_resampler/README_MOZILLA
media/libspeex_resampler/src/arch.h
media/libspeex_resampler/src/fixed_generic.h
media/libspeex_resampler/src/resample.c
media/libspeex_resampler/sse-detect-runtime.patch
media/libspeex_resampler/update.sh
--- a/media/libspeex_resampler/README_MOZILLA
+++ b/media/libspeex_resampler/README_MOZILLA
@@ -1,5 +1,5 @@
 This source is from the Speex DSP library
-(http://git.xiph.org/?p=speexdsp.git), from commit bbe7e099.
+(http://git.xiph.org/?p=speexdsp.git), from commit 305e54ea.
 
 It consists in the audio resampling code (resampler.c) and its header files
 dependancies, imported into the tree using the update.sh script.
--- a/media/libspeex_resampler/src/arch.h
+++ b/media/libspeex_resampler/src/arch.h
@@ -158,16 +158,17 @@ typedef float spx_word32_t;
 #define SHL16(a,shift) (a)
 #define SHR32(a,shift) (a)
 #define SHL32(a,shift) (a)
 #define PSHR16(a,shift) (a)
 #define PSHR32(a,shift) (a)
 #define VSHR32(a,shift) (a)
 #define SATURATE16(x,a) (x)
 #define SATURATE32(x,a) (x)
+#define SATURATE32PSHR(x,shift,a) (x)
 
 #define PSHR(a,shift)       (a)
 #define SHR(a,shift)       (a)
 #define SHL(a,shift)       (a)
 #define SATURATE(x,a) (x)
 
 #define ADD16(a,b) ((a)+(b))
 #define SUB16(a,b) ((a)-(b))
--- a/media/libspeex_resampler/src/fixed_generic.h
+++ b/media/libspeex_resampler/src/fixed_generic.h
@@ -47,16 +47,20 @@
 #define SHR32(a,shift) ((a) >> (shift))
 #define SHL32(a,shift) ((a) << (shift))
 #define PSHR16(a,shift) (SHR16((a)+((1<<((shift))>>1)),shift))
 #define PSHR32(a,shift) (SHR32((a)+((EXTEND32(1)<<((shift))>>1)),shift))
 #define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
 #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
+#define SATURATE32PSHR(x,shift,a) (((x)>=(SHL32(a,shift))) ? (a) : \
+                                   (x)<=-(SHL32(a,shift)) ? -(a) : \
+                                   (PSHR32(x, shift)))
+
 #define SHR(a,shift) ((a) >> (shift))
 #define SHL(a,shift) ((spx_word32_t)(a) << (shift))
 #define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
 #define SATURATE(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
 
 #define ADD16(a,b) ((spx_word16_t)((spx_word16_t)(a)+(spx_word16_t)(b)))
 #define SUB16(a,b) ((spx_word16_t)(a)-(spx_word16_t)(b))
--- a/media/libspeex_resampler/src/resample.c
+++ b/media/libspeex_resampler/src/resample.c
@@ -74,17 +74,17 @@ static void speex_free (void *ptr) {free
 #include "arch.h"
 #include "os_support.h"
 #endif /* OUTSIDE_SPEEX */
 
 #include "stack_alloc.h"
 #include <math.h>
 
 #ifndef M_PI
-#define M_PI 3.14159263
+#define M_PI 3.14159265358979323846
 #endif
 
 #ifdef FIXED_POINT
 #define WORD2INT(x) ((x) < -32767 ? -32768 : ((x) > 32766 ? 32767 : (x)))  
 #else
 #define WORD2INT(x) ((x) < -32767.5f ? -32768 : ((x) > 32766.5f ? 32767 : floor(.5+(x))))  
 #endif
                
@@ -104,16 +104,20 @@ static void speex_free (void *ptr) {free
 #define inline __inline
 #endif
 #include "resample_sse.h"
 #ifdef _MSC_VER
 #undef inline
 #endif
 #endif
 
+#ifdef _USE_NEON
+#include "resample_neon.h"
+#endif
+
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
 #define FIXED_STACK_ALLOC 8192
 #else
 #define FIXED_STACK_ALLOC 1024
 #endif
 
 typedef int (*resampler_basic_func)(SpeexResamplerState *, spx_uint32_t , const spx_word16_t *, spx_uint32_t *, spx_word16_t *, spx_uint32_t *);
@@ -349,19 +353,17 @@ static int resampler_basic_direct_single
    spx_word32_t sum;
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-    if (moz_has_sse()) {
-      sum = inner_product_single(sinct, iptr, N);
-    } else {
+      if (!moz_has_sse()) {
 #endif
       int j;
       sum = 0;
       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
 
 /*    This code is slower on most DSPs which have only 2 accumulators.
       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
       I think we can trust the compiler and let it vectorize and/or unroll itself.
@@ -369,21 +371,24 @@ static int resampler_basic_direct_single
       for(j=0;j<N;j+=4) {
         accum[0] += MULT16_16(sinct[j], iptr[j]);
         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
       }
       sum = accum[0] + accum[1] + accum[2] + accum[3];
 */
+      sum = SATURATE32PSHR(sum, 15, 32767);
 #ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-    }
+      } else {
+      sum = inner_product_single(sinct, iptr, N);
+      }
 #endif
 
-      out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
+      out[out_stride * out_sample++] = sum;
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
    }
@@ -411,30 +416,30 @@ static int resampler_basic_direct_double
 
    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    {
       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-      if(moz_has_sse2()) {
-        sum = inner_product_double(sinct, iptr, N);
-      } else {
+      if(!moz_has_sse2()) {
 #endif
-        int j;
-        double accum[4] = {0,0,0,0};
+      int j;
+      double accum[4] = {0,0,0,0};
 
-        for(j=0;j<N;j+=4) {
-          accum[0] += sinct[j]*iptr[j];
-          accum[1] += sinct[j+1]*iptr[j+1];
-          accum[2] += sinct[j+2]*iptr[j+2];
-          accum[3] += sinct[j+3]*iptr[j+3];
-        }
-        sum = accum[0] + accum[1] + accum[2] + accum[3];
+      for(j=0;j<N;j+=4) {
+        accum[0] += sinct[j]*iptr[j];
+        accum[1] += sinct[j+1]*iptr[j+1];
+        accum[2] += sinct[j+2]*iptr[j+2];
+        accum[3] += sinct[j+3]*iptr[j+3];
+      }
+      sum = accum[0] + accum[1] + accum[2] + accum[3];
 #ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+      } else {
+      sum = inner_product_double(sinct, iptr, N);
       }
 #endif
 
       out[out_stride * out_sample++] = PSHR32(sum, 15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
@@ -470,38 +475,40 @@ static int resampler_basic_interpolate_s
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-      if (moz_has_sse()) {
-        cubic_coef(frac, interp);
-        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-      } else {
+      if (!moz_has_sse()) {
 #endif
-        int j;
-        spx_word32_t accum[4] = {0,0,0,0};
+      int j;
+      spx_word32_t accum[4] = {0,0,0,0};
 
-        for(j=0;j<N;j++) {
-          const spx_word16_t curr_in=iptr[j];
-          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
-          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
-          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
-          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
-        }
-        cubic_coef(frac, interp);
-        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+      for(j=0;j<N;j++) {
+        const spx_word16_t curr_in=iptr[j];
+        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+      }
+
+      cubic_coef(frac, interp);
+      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+      sum = SATURATE32PSHR(sum, 15, 32767);
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+      } else {
+      cubic_coef(frac, interp);
+      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
       }
 #endif
-
-      out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
+      
+      out[out_stride * out_sample++] = sum;
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
    }
@@ -535,37 +542,38 @@ static int resampler_basic_interpolate_d
       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 #else
       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 #endif
       spx_word16_t interp[4];
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-      if (moz_has_sse2()) {
-        cubic_coef(frac, interp);
-        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-      } else {
+      if (!moz_has_sse2()) {
 #endif
       int j;
       double accum[4] = {0,0,0,0};
 
       for(j=0;j<N;j++) {
         const double curr_in=iptr[j];
         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
       }
 
       cubic_coef(frac, interp);
       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
-#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+      } else {
+      cubic_coef(frac, interp);
+      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
       }
 #endif
+      
       out[out_stride * out_sample++] = PSHR32(sum,15);
       last_sample += int_advance;
       samp_frac_num += frac_advance;
       if (samp_frac_num >= den_rate)
       {
          samp_frac_num -= den_rate;
          last_sample++;
       }
@@ -574,19 +582,20 @@ static int resampler_basic_interpolate_d
    st->last_sample[channel_index] = last_sample;
    st->samp_frac_num[channel_index] = samp_frac_num;
    return out_sample;
 }
 #endif
 
 static void update_filter(SpeexResamplerState *st)
 {
-   spx_uint32_t old_length;
-   
-   old_length = st->filt_len;
+   spx_uint32_t old_length = st->filt_len;
+   spx_uint32_t old_alloc_size = st->mem_alloc_size;
+   spx_uint32_t min_alloc_size;
+
    st->oversample = quality_map[st->quality].oversample;
    st->filt_len = quality_map[st->quality].base_length;
    
    if (st->num_rate > st->den_rate)
    {
       /* down-sampling */
       st->cutoff = quality_map[st->quality].downsample_bandwidth * st->den_rate / st->num_rate;
       /* FIXME: divide the numerator and denominator by a certain amount if they're too large */
@@ -604,22 +613,24 @@ static void update_filter(SpeexResampler
       if (st->oversample < 1)
          st->oversample = 1;
    } else {
       /* up-sampling */
       st->cutoff = quality_map[st->quality].upsample_bandwidth;
    }
    
    /* Choose the resampling type that requires the least amount of memory */
+#ifdef RESAMPLE_FULL_SINC_TABLE
+   if (1)
+#else
    if (st->filt_len*st->den_rate <= st->filt_len*st->oversample+8)
+#endif
    {
       spx_uint32_t i;
-      if (!st->sinc_table)
-         st->sinc_table = (spx_word16_t *)speex_alloc(st->filt_len*st->den_rate*sizeof(spx_word16_t));
-      else if (st->sinc_table_length < st->filt_len*st->den_rate)
+      if (st->sinc_table_length < st->filt_len*st->den_rate)
       {
          st->sinc_table = (spx_word16_t *)speex_realloc(st->sinc_table,st->filt_len*st->den_rate*sizeof(spx_word16_t));
          st->sinc_table_length = st->filt_len*st->den_rate;
       }
       for (i=0;i<st->den_rate;i++)
       {
          spx_int32_t j;
          for (j=0;j<st->filt_len;j++)
@@ -633,19 +644,17 @@ static void update_filter(SpeexResampler
       if (st->quality>8)
          st->resampler_ptr = resampler_basic_direct_double;
       else
          st->resampler_ptr = resampler_basic_direct_single;
 #endif
       /*fprintf (stderr, "resampler uses direct sinc table and normalised cutoff %f\n", cutoff);*/
    } else {
       spx_int32_t i;
-      if (!st->sinc_table)
-         st->sinc_table = (spx_word16_t *)speex_alloc((st->filt_len*st->oversample+8)*sizeof(spx_word16_t));
-      else if (st->sinc_table_length < st->filt_len*st->oversample+8)
+      if (st->sinc_table_length < st->filt_len*st->oversample+8)
       {
          st->sinc_table = (spx_word16_t *)speex_realloc(st->sinc_table,(st->filt_len*st->oversample+8)*sizeof(spx_word16_t));
          st->sinc_table_length = st->filt_len*st->oversample+8;
       }
       for (i=-4;i<(spx_int32_t)(st->oversample*st->filt_len+4);i++)
          st->sinc_table[i+4] = sinc(st->cutoff,(i/(float)st->oversample - st->filt_len/2), st->filt_len, quality_map[st->quality].window_func);
 #ifdef FIXED_POINT
       st->resampler_ptr = resampler_basic_interpolate_single;
@@ -659,54 +668,44 @@ static void update_filter(SpeexResampler
    }
    st->int_advance = st->num_rate/st->den_rate;
    st->frac_advance = st->num_rate%st->den_rate;
 
    
    /* Here's the place where we update the filter memory to take into account
       the change in filter length. It's probably the messiest part of the code
       due to handling of lots of corner cases. */
-   if (!st->mem)
+   min_alloc_size = st->filt_len-1 + st->buffer_size;
+   if (min_alloc_size > st->mem_alloc_size)
+   {
+      st->mem = (spx_word16_t*)speex_realloc(st->mem, st->nb_channels*min_alloc_size * sizeof(spx_word16_t));
+      st->mem_alloc_size = min_alloc_size;
+   }
+   if (!st->started)
    {
       spx_uint32_t i;
-      st->mem_alloc_size = st->filt_len-1 + st->buffer_size;
-      st->mem = (spx_word16_t*)speex_alloc(st->nb_channels*st->mem_alloc_size * sizeof(spx_word16_t));
-      for (i=0;i<st->nb_channels*st->mem_alloc_size;i++)
-         st->mem[i] = 0;
-      /*speex_warning("init filter");*/
-   } else if (!st->started)
-   {
-      spx_uint32_t i;
-      st->mem_alloc_size = st->filt_len-1 + st->buffer_size;
-      st->mem = (spx_word16_t*)speex_realloc(st->mem, st->nb_channels*st->mem_alloc_size * sizeof(spx_word16_t));
       for (i=0;i<st->nb_channels*st->mem_alloc_size;i++)
          st->mem[i] = 0;
       /*speex_warning("reinit filter");*/
    } else if (st->filt_len > old_length)
    {
-      spx_int32_t i;
+      spx_uint32_t i;
       /* Increase the filter length */
       /*speex_warning("increase filter size");*/
-      int old_alloc_size = st->mem_alloc_size;
-      if ((st->filt_len-1 + st->buffer_size) > st->mem_alloc_size)
+      for (i=st->nb_channels;i--;)
       {
-         st->mem_alloc_size = st->filt_len-1 + st->buffer_size;
-         st->mem = (spx_word16_t*)speex_realloc(st->mem, st->nb_channels*st->mem_alloc_size * sizeof(spx_word16_t));
-      }
-      for (i=st->nb_channels-1;i>=0;i--)
-      {
-         spx_int32_t j;
+         spx_uint32_t j;
          spx_uint32_t olen = old_length;
          /*if (st->magic_samples[i])*/
          {
             /* Try and remove the magic samples as if nothing had happened */
             
             /* FIXME: This is wrong but for now we need it to avoid going over the array bounds */
             olen = old_length + 2*st->magic_samples[i];
-            for (j=old_length-2+st->magic_samples[i];j>=0;j--)
+            for (j=old_length-1+st->magic_samples[i];j--;)
                st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]] = st->mem[i*old_alloc_size+j];
             for (j=0;j<st->magic_samples[i];j++)
                st->mem[i*st->mem_alloc_size+j] = 0;
             st->magic_samples[i] = 0;
          }
          if (st->filt_len > olen)
          {
             /* If the new filter length is still bigger than the "augmented" length */
--- a/media/libspeex_resampler/sse-detect-runtime.patch
+++ b/media/libspeex_resampler/sse-detect-runtime.patch
@@ -1,10 +1,11 @@
---- a/src/resample.c	2014-07-01 17:25:53.999320032 +1200
-+++ b/src/resample.c	2014-07-01 17:42:18.822611775 +1200
+diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
+--- a/media/libspeex_resampler/src/resample.c
++++ b/media/libspeex_resampler/src/resample.c
 @@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
                 
  #define IMAX(a,b) ((a) > (b) ? (a) : (b))
  #define IMIN(a,b) ((a) < (b) ? (a) : (b))
  
  #ifndef NULL
  #define NULL 0
  #endif
@@ -18,203 +19,174 @@
 +#define inline __inline
 +#endif
  #include "resample_sse.h"
 +#ifdef _MSC_VER
 +#undef inline
 +#endif
  #endif
  
+ #ifdef _USE_NEON
+ #include "resample_neon.h"
+ #endif
+ 
  /* Numer of elements to allocate on the stack */
  #ifdef VAR_ARRAYS
- #define FIXED_STACK_ALLOC 8192
- #else
- #define FIXED_STACK_ALLOC 1024
- #endif
-@@ -338,35 +348,39 @@ static int resampler_basic_direct_single
+@@ -342,17 +352,19 @@ static int resampler_basic_direct_single
     const spx_uint32_t den_rate = st->den_rate;
     spx_word32_t sum;
  
     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     {
        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
        const spx_word16_t *iptr = & in[last_sample];
  
 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-+    if (moz_has_sse()) {
-+      sum = inner_product_single(sinct, iptr, N);
-+    } else {
++      if (!moz_has_sse()) {
 +#endif
        int j;
        sum = 0;
        for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
  
  /*    This code is slower on most DSPs which have only 2 accumulators.
        Plus this this forces truncation to 32 bits and you lose the HW guard bits.
        I think we can trust the compiler and let it vectorize and/or unroll itself.
        spx_word32_t accum[4] = {0,0,0,0};
-       for(j=0;j<N;j+=4) {
+@@ -360,18 +372,20 @@ static int resampler_basic_direct_single
          accum[0] += MULT16_16(sinct[j], iptr[j]);
          accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
          accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
          accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
        }
        sum = accum[0] + accum[1] + accum[2] + accum[3];
  */
+       sum = SATURATE32PSHR(sum, 15, 32767);
 -#else
--      sum = inner_product_single(sinct, iptr, N);
 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-+    }
++      } else {
+       sum = inner_product_single(sinct, iptr, N);
++      }
  #endif
  
-       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
+       out[out_stride * out_sample++] = sum;
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-@@ -395,29 +409,33 @@ static int resampler_basic_direct_double
+@@ -400,29 +414,33 @@ static int resampler_basic_direct_double
     const spx_uint32_t den_rate = st->den_rate;
     double sum;
  
     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     {
        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
        const spx_word16_t *iptr = & in[last_sample];
  
 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
--      int j;
--      double accum[4] = {0,0,0,0};
--
--      for(j=0;j<N;j+=4) {
--        accum[0] += sinct[j]*iptr[j];
--        accum[1] += sinct[j+1]*iptr[j+1];
--        accum[2] += sinct[j+2]*iptr[j+2];
--        accum[3] += sinct[j+3]*iptr[j+3];
 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-+      if(moz_has_sse2()) {
-+        sum = inner_product_double(sinct, iptr, N);
++      if(!moz_has_sse2()) {
++#endif
+       int j;
+       double accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j+=4) {
+         accum[0] += sinct[j]*iptr[j];
+         accum[1] += sinct[j+1]*iptr[j+1];
+         accum[2] += sinct[j+2]*iptr[j+2];
+         accum[3] += sinct[j+3]*iptr[j+3];
+       }
+       sum = accum[0] + accum[1] + accum[2] + accum[3];
+-#else
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
 +      } else {
-+#endif
-+        int j;
-+        double accum[4] = {0,0,0,0};
-+
-+        for(j=0;j<N;j+=4) {
-+          accum[0] += sinct[j]*iptr[j];
-+          accum[1] += sinct[j+1]*iptr[j+1];
-+          accum[2] += sinct[j+2]*iptr[j+2];
-+          accum[3] += sinct[j+3]*iptr[j+3];
-+        }
-+        sum = accum[0] + accum[1] + accum[2] + accum[3];
-+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-       }
--      sum = accum[0] + accum[1] + accum[2] + accum[3];
--#else
--      sum = inner_product_double(sinct, iptr, N);
+       sum = inner_product_double(sinct, iptr, N);
++      }
  #endif
  
        out[out_stride * out_sample++] = PSHR32(sum, 15);
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-@@ -451,35 +469,38 @@ static int resampler_basic_interpolate_s
+@@ -456,34 +474,38 @@ static int resampler_basic_interpolate_s
  #ifdef FIXED_POINT
        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
  #else
        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
  #endif
        spx_word16_t interp[4];
  
  
 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
--      int j;
--      spx_word32_t accum[4] = {0,0,0,0};
--
--      for(j=0;j<N;j++) {
--        const spx_word16_t curr_in=iptr[j];
--        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
--        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
--        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
--        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-+      if (moz_has_sse()) {
-+        cubic_coef(frac, interp);
-+        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-+      } else {
++      if (!moz_has_sse()) {
 +#endif
-+        int j;
-+        spx_word32_t accum[4] = {0,0,0,0};
-+
-+        for(j=0;j<N;j++) {
-+          const spx_word16_t curr_in=iptr[j];
-+          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
-+          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
-+          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
-+          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
-+        }
-+        cubic_coef(frac, interp);
-+        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+       int j;
+       spx_word32_t accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j++) {
+         const spx_word16_t curr_in=iptr[j];
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+       }
+ 
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+       sum = SATURATE32PSHR(sum, 15, 32767);
+-#else
 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-       }
--
--      cubic_coef(frac, interp);
--      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
--#else
--      cubic_coef(frac, interp);
--      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      } else {
+       cubic_coef(frac, interp);
+       sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      }
  #endif
--      
-+
-       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
+       
+       out[out_stride * out_sample++] = sum;
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-          last_sample++;
-       }
-@@ -513,35 +534,38 @@ static int resampler_basic_interpolate_d
+@@ -519,33 +541,37 @@ static int resampler_basic_interpolate_d
  #ifdef FIXED_POINT
        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
  #else
        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
  #endif
        spx_word16_t interp[4];
  
  
 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-+      if (moz_has_sse2()) {
-+        cubic_coef(frac, interp);
-+        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-+      } else {
++      if (!moz_has_sse2()) {
 +#endif
        int j;
        double accum[4] = {0,0,0,0};
  
        for(j=0;j<N;j++) {
          const double curr_in=iptr[j];
          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
        }
  
        cubic_coef(frac, interp);
        sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
 -#else
--      cubic_coef(frac, interp);
--      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      } else {
+       cubic_coef(frac, interp);
+       sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
 +      }
  #endif
--      
+       
        out[out_stride * out_sample++] = PSHR32(sum,15);
        last_sample += int_advance;
        samp_frac_num += frac_advance;
        if (samp_frac_num >= den_rate)
        {
           samp_frac_num -= den_rate;
-          last_sample++;
-       }
--- a/media/libspeex_resampler/update.sh
+++ b/media/libspeex_resampler/update.sh
@@ -15,10 +15,10 @@ cp $1/libspeexdsp/arch.h src
 cp $1/libspeexdsp/stack_alloc.h src
 cp $1/libspeexdsp/fixed_generic.h src
 cp $1/include/speex/speex_resampler.h src
 cp $1/AUTHORS .
 cp $1/COPYING .
 
 # apply outstanding local patches
 patch -p3 < outside-speex.patch
-patch -p1 < sse-detect-runtime.patch
+patch -p3 < sse-detect-runtime.patch
 patch -p3 < set-skip-frac.patch