Bug 881587 - Use SSE2 version of AudioNodeEngine.cpp routines added in bug 815643. r=padenot
author: Dan Minor <dminor@mozilla.com>
Thu, 14 Apr 2016 08:57:21 -0400
changeset 331478 b1da60432e4188749a6b961bbb7ed1900433376a
parent 331477 e719cc5de7b797b82d821cfd6f6c364a001fbea5
child 331479 6d973d2f1bae985278756e086e7e8b5c094d0806
push id: 6048
push user: kmoir@mozilla.com
push date: Mon, 06 Jun 2016 19:02:08 +0000
treeherder: mozilla-beta@46d72a56c57d [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: padenot
bugs: 881587, 815643
milestone: 48.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 881587 - Use SSE2 version of AudioNodeEngine.cpp routines added in bug 815643. r=padenot MozReview-Commit-ID: 3cfU3oTruAC
dom/media/webaudio/AudioNodeEngine.cpp
dom/media/webaudio/AudioNodeEngineSSE2.cpp
--- a/dom/media/webaudio/AudioNodeEngine.cpp
+++ b/dom/media/webaudio/AudioNodeEngine.cpp
@@ -130,16 +130,24 @@ AudioBlockCopyChannelWithScale(const flo
 }
 
 void
 BufferComplexMultiply(const float* aInput,
                       const float* aScale,
                       float* aOutput,
                       uint32_t aSize)
 {
+
+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
+    return;
+  }
+#endif
+
   for (uint32_t i = 0; i < aSize * 2; i += 2) {
     float real1 = aInput[i];
     float imag1 = aInput[i + 1];
     float real2 = aScale[i];
     float imag2 = aScale[i + 1];
     float realResult = real1 * real2 - imag1 * imag2;
     float imagResult = real1 * imag2 + imag1 * real2;
     aOutput[i] = realResult;
@@ -308,16 +316,37 @@ AudioBlockPanStereoToStereo(const float 
     }
   }
 }
 
 float
 AudioBufferSumOfSquares(const float* aInput, uint32_t aLength)
 {
   float sum = 0.0f;
+
+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    const float* alignedInput = ALIGNED16(aInput);
+    float vLength = (aLength >> 4) << 4;
+
+    // use scalar operations for any unaligned data at the beginning
+    while (aInput != alignedInput) {
+        sum += *aInput * *aInput;
+        ++aInput;
+    }
+
+    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
+
+    // adjust aInput and aLength to use scalar operations for any
+    // remaining values
+    aInput = alignedInput + 1;
+    aLength -= vLength;
+  }
+#endif
+
   while (aLength--) {
     sum += *aInput * *aInput;
     ++aInput;
   }
   return sum;
 }
 
 void
--- a/dom/media/webaudio/AudioNodeEngineSSE2.cpp
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
@@ -218,16 +218,21 @@ void BufferComplexMultiply_SSE(const flo
                                float* aOutput,
                                uint32_t aSize)
 {
   unsigned i;
   __m128 in0, in1, in2, in3,
          outreal0, outreal1, outreal2, outreal3,
          outimag0, outimag1, outimag2, outimag3;
 
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
   for (i = 0; i < aSize * 2; i += 16) {
     in0 = _mm_load_ps(&aInput[i]);
     in1 = _mm_load_ps(&aInput[i + 4]);
     in2 = _mm_load_ps(&aInput[i + 8]);
     in3 = _mm_load_ps(&aInput[i + 12]);
 
     outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
     outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
@@ -268,16 +273,19 @@ void BufferComplexMultiply_SSE(const flo
 float
 AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
 {
   unsigned i;
   __m128 in0, in1, in2, in3,
          acc0, acc1, acc2, acc3;
   float out[4];
 
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_MULTIPLE16(aLength);
+
   acc0 = _mm_setzero_ps();
   acc1 = _mm_setzero_ps();
   acc2 = _mm_setzero_ps();
   acc3 = _mm_setzero_ps();
 
   for (i = 0; i < aLength; i+=16) {
     in0 = _mm_load_ps(&aInput[i]);
     in1 = _mm_load_ps(&aInput[i + 4]);