Bug 926838 - [Part 4] Implement AlignedTArray, because 32-byte alignment is required by OpenMAX DL. Also modify callers. r=ehsan
☠☠ backed out by 2d0b99af10da ☠ ☠
author: JW Wang <jwwang@mozilla.com>
Wed, 13 Nov 2013 11:07:31 +0800
changeset 272194 2a08b161b02ea5c9f49e1b6ec82cdbad717c159e
parent 272193 041ed2e081684857b98178bc256e2b428f81bad6
child 272195 1ada6df926ba28089114de4ab58acf4a39324926
push id: 4830
push user: jlund@mozilla.com
push date: Mon, 29 Jun 2015 20:18:48 +0000
treeherder: mozilla-beta@4c2175bb0420 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: ehsan
bugs: 926838
milestone: 40.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 926838 - [Part 4] Implement AlignedTArray, because 32-byte alignment is required by OpenMAX DL. Also modify callers. r=ehsan
dom/media/webaudio/AlignedTArray.h
dom/media/webaudio/AnalyserNode.cpp
dom/media/webaudio/AnalyserNode.h
dom/media/webaudio/FFTBlock.cpp
dom/media/webaudio/FFTBlock.h
dom/media/webaudio/blink/FFTConvolver.h
dom/media/webaudio/blink/HRTFKernel.cpp
dom/media/webaudio/blink/HRTFPanner.h
dom/media/webaudio/blink/PeriodicWave.cpp
dom/media/webaudio/blink/PeriodicWave.h
dom/media/webaudio/moz.build
new file mode 100644
--- /dev/null
+++ b/dom/media/webaudio/AlignedTArray.h
@@ -0,0 +1,85 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef AlignedTArray_h__
+#define AlignedTArray_h__
+
+#include "mozilla/Alignment.h"
+#include "nsTArray.h"
+
+/**
+ * nsTArray_Impl wrapper whose Elements() pointer is rounded up to an N-byte boundary.
+ * E: element type, must be a POD type. N: alignment in bytes, must be a power of 2.
+ */
+template <typename E, int N, typename Alloc>
+class AlignedTArray_Impl : public nsTArray_Impl<E, Alloc>
+{
+  static_assert((N & (N-1)) == 0, "N must be power of 2");
+  typedef nsTArray_Impl<E, Alloc>                    base_type;
+public:
+  typedef E                                          elem_type;
+  typedef typename base_type::size_type              size_type;
+  typedef typename base_type::index_type             index_type;
+  // Element access goes through the aligned pointer; the capacity constructor requests sExtra elements of slack.
+  AlignedTArray_Impl() {}
+  explicit AlignedTArray_Impl(size_type capacity) : base_type(capacity+sExtra) {}
+  elem_type* Elements() { return getAligned(base_type::Elements()); }
+  const elem_type* Elements() const { return getAligned(base_type::Elements()); }
+  elem_type& operator[](index_type i) { return Elements()[i];}
+  const elem_type& operator[](index_type i) const { return Elements()[i]; }
+  // SetLength() adds the sExtra slack to the stored length; Length() subtracts it again.
+  typename Alloc::ResultType SetLength(size_type newLen) {
+    return base_type::SetLength(newLen + sExtra);
+  }
+  size_type Length() const {
+    return base_type::Length() <= sExtra ? 0 : base_type::Length() - sExtra;
+  }
+
+private:
+  AlignedTArray_Impl(const AlignedTArray_Impl& other) = delete;
+  void operator=(const AlignedTArray_Impl& other) = delete;
+  // sPadding: bytes of slack (N minus E's alignment, or 0); sExtra: the same, rounded up to whole elements.
+  static const size_type sPadding = N <= MOZ_ALIGNOF(E) ? 0 : N - MOZ_ALIGNOF(E);
+  static const size_type sExtra = (sPadding + sizeof(E) - 1) / sizeof(E);
+  // Rounds p up to the next multiple of N; the mask works because N is a power of 2.
+  template <typename U>
+  static U* getAligned(U* p)
+  {
+    return reinterpret_cast<U*>(((uintptr_t)p + N - 1) & ~(N-1));
+  }
+};
+
+// Aligned array using nsTArrayInfallibleAllocator; the alignment N defaults to 32 bytes.
+template <typename E, int N=32>
+class AlignedTArray : public AlignedTArray_Impl<E, N, nsTArrayInfallibleAllocator>
+{
+public:
+  typedef AlignedTArray_Impl<E, N, nsTArrayInfallibleAllocator> base_type;
+  typedef AlignedTArray<E, N>                                   self_type;
+  typedef typename base_type::size_type                         size_type;
+  AlignedTArray() {}
+  explicit AlignedTArray(size_type capacity) : base_type(capacity) {}
+private:
+  AlignedTArray(const AlignedTArray& other) = delete;
+  void operator=(const AlignedTArray& other) = delete;
+};
+
+// Aligned array using nsTArrayFallibleAllocator; the alignment N defaults to 32 bytes.
+template <typename E, int N=32>
+class AlignedFallibleTArray : public AlignedTArray_Impl<E, N, nsTArrayFallibleAllocator>
+{
+public:
+  typedef AlignedTArray_Impl<E, N, nsTArrayFallibleAllocator> base_type;
+  typedef AlignedFallibleTArray<E, N>                         self_type;
+  typedef typename base_type::size_type                       size_type;
+  AlignedFallibleTArray() {}
+  explicit AlignedFallibleTArray(size_type capacity) : base_type(capacity) {}
+private:
+  AlignedFallibleTArray(const AlignedFallibleTArray& other) = delete;
+  void operator=(const AlignedFallibleTArray& other) = delete;
+};
+
+#endif // AlignedTArray_h__
--- a/dom/media/webaudio/AnalyserNode.cpp
+++ b/dom/media/webaudio/AnalyserNode.cpp
@@ -246,27 +246,26 @@ AnalyserNode::GetByteTimeDomainData(cons
     buffer[i] = static_cast<unsigned char>(scaled);
   }
 }
 
 bool
 AnalyserNode::FFTAnalysis()
 {
   float* inputBuffer;
-  bool allocated = false;
+  AlignedFallibleTArray<float> tmpBuffer; // aligned scratch copy; replaces the manual malloc/free pair
   if (mWriteIndex == 0) {
     inputBuffer = mBuffer.Elements();
   } else {
-    inputBuffer = static_cast<float*>(malloc(FftSize() * sizeof(float)));
-    if (!inputBuffer) {
+    if (!tmpBuffer.SetLength(FftSize())) { // SetLength() is truthy on success: bail out only on failure
       return false;
     }
+    inputBuffer = tmpBuffer.Elements();
     memcpy(inputBuffer, mBuffer.Elements() + mWriteIndex, sizeof(float) * (FftSize() - mWriteIndex));
     memcpy(inputBuffer + FftSize() - mWriteIndex, mBuffer.Elements(), sizeof(float) * mWriteIndex);
-    allocated = true;
   }
 
   ApplyBlackmanWindow(inputBuffer, FftSize());
 
   mAnalysisBlock.PerformFFT(inputBuffer);
 
   // Normalize so than an input sine wave at 0dBfs registers as 0dBfs (undo FFT scaling factor).
   const double magnitudeScale = 1.0 / FftSize();
@@ -274,19 +273,16 @@ AnalyserNode::FFTAnalysis()
   for (uint32_t i = 0; i < mOutputBuffer.Length(); ++i) {
     double scalarMagnitude = NS_hypot(mAnalysisBlock.RealData(i),
                                       mAnalysisBlock.ImagData(i)) *
                              magnitudeScale;
     mOutputBuffer[i] = mSmoothingTimeConstant * mOutputBuffer[i] +
                        (1.0 - mSmoothingTimeConstant) * scalarMagnitude;
   }
 
-  if (allocated) {
-    free(inputBuffer);
-  }
   return true;
 }
 
 void
 AnalyserNode::ApplyBlackmanWindow(float* aBuffer, uint32_t aSize)
 {
   double alpha = 0.16;
   double a0 = 0.5 * (1.0 - alpha);
@@ -300,26 +296,26 @@ AnalyserNode::ApplyBlackmanWindow(float*
   }
 }
 
 bool
 AnalyserNode::AllocateBuffer()
 {
   bool result = true;
   if (mBuffer.Length() != FftSize()) {
-    result = mBuffer.SetLength(FftSize());
-    if (result) {
-      memset(mBuffer.Elements(), 0, sizeof(float) * FftSize());
-      mWriteIndex = 0;
+    if (!mBuffer.SetLength(FftSize())) { // truthy result means success, so fail only when it is falsy
+      return false;
+    }
+    memset(mBuffer.Elements(), 0, sizeof(float) * FftSize());
+    mWriteIndex = 0; // the input buffer was just reset, so writing restarts at the beginning
 
-      result = mOutputBuffer.SetLength(FrequencyBinCount());
-      if (result) {
-        memset(mOutputBuffer.Elements(), 0, sizeof(float) * FrequencyBinCount());
-      }
+    if (!mOutputBuffer.SetLength(FrequencyBinCount())) { // same success convention as above
+      return false;
     }
+    memset(mOutputBuffer.Elements(), 0, sizeof(float) * FrequencyBinCount());
   }
   return result;
 }
 
 void
 AnalyserNode::AppendChunk(const AudioChunk& aChunk)
 {
   const uint32_t bufferSize = mBuffer.Length();
--- a/dom/media/webaudio/AnalyserNode.h
+++ b/dom/media/webaudio/AnalyserNode.h
@@ -4,16 +4,17 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef AnalyserNode_h_
 #define AnalyserNode_h_
 
 #include "AudioNode.h"
 #include "FFTBlock.h"
+#include "AlignedTArray.h"
 
 namespace mozilla {
 namespace dom {
 
 class AudioContext;
 
 class AnalyserNode final : public AudioNode
 {
@@ -72,17 +73,17 @@ private:
   void ApplyBlackmanWindow(float* aBuffer, uint32_t aSize);
 
 private:
   FFTBlock mAnalysisBlock;
   double mMinDecibels;
   double mMaxDecibels;
   double mSmoothingTimeConstant;
   uint32_t mWriteIndex;
-  FallibleTArray<float> mBuffer;
-  FallibleTArray<float> mOutputBuffer;
+  AlignedFallibleTArray<float> mBuffer;
+  AlignedFallibleTArray<float> mOutputBuffer;
 };
 
 }
 }
 
 #endif
 
--- a/dom/media/webaudio/FFTBlock.cpp
+++ b/dom/media/webaudio/FFTBlock.cpp
@@ -39,36 +39,35 @@ typedef std::complex<double> Complex;
 FFTBlock* FFTBlock::CreateInterpolatedBlock(const FFTBlock& block0, const FFTBlock& block1, double interp)
 {
     FFTBlock* newBlock = new FFTBlock(block0.FFTSize());
 
     newBlock->InterpolateFrequencyComponents(block0, block1, interp);
 
     // In the time-domain, the 2nd half of the response must be zero, to avoid circular convolution aliasing...
     int fftSize = newBlock->FFTSize();
-    nsTArray<float> buffer;
-    buffer.SetLength(fftSize);
+    AlignedTArray<float> buffer(fftSize); // NOTE(review): passes fftSize as a capacity and no longer calls SetLength() - confirm that is intended
     newBlock->GetInverseWithoutScaling(buffer.Elements());
     AudioBufferInPlaceScale(buffer.Elements(), 1.0f / fftSize, fftSize / 2);
     PodZero(buffer.Elements() + fftSize / 2, fftSize / 2);
 
     // Put back into frequency domain.
     newBlock->PerformFFT(buffer.Elements());
 
     return newBlock;
 }
 
 void FFTBlock::InterpolateFrequencyComponents(const FFTBlock& block0, const FFTBlock& block1, double interp)
 {
     // FIXME : with some work, this method could be optimized
 
-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();
 
-    const kiss_fft_cpx* dft1 = block0.mOutputBuffer.Elements();
-    const kiss_fft_cpx* dft2 = block1.mOutputBuffer.Elements();
+    const ComplexU* dft1 = block0.mOutputBuffer.Elements();
+    const ComplexU* dft2 = block1.mOutputBuffer.Elements();
 
     MOZ_ASSERT(mFFTSize == block0.FFTSize());
     MOZ_ASSERT(mFFTSize == block1.FFTSize());
     double s1base = (1.0 - interp);
     double s2base = interp;
 
     double phaseAccum = 0.0;
     double lastPhase1 = 0.0;
@@ -149,17 +148,17 @@ void FFTBlock::InterpolateFrequencyCompo
 
         dft[i].r = static_cast<float>(mag * cos(phaseAccum));
         dft[i].i = static_cast<float>(mag * sin(phaseAccum));
     }
 }
 
 double FFTBlock::ExtractAverageGroupDelay()
 {
-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();
 
     double aveSum = 0.0;
     double weightSum = 0.0;
     double lastPhase = 0.0;
 
     int halfSize = FFTSize() / 2;
 
     const double kSamplePhaseDelay = (2.0 * M_PI) / double(FFTSize());
@@ -200,17 +199,17 @@ double FFTBlock::ExtractAverageGroupDela
 
     return aveSampleDelay;
 }
 
 void FFTBlock::AddConstantGroupDelay(double sampleFrameDelay)
 {
     int halfSize = FFTSize() / 2;
 
-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();
 
     const double kSamplePhaseDelay = (2.0 * M_PI) / double(FFTSize());
 
     double phaseAdj = -sampleFrameDelay * kSamplePhaseDelay;
 
     // Add constant group delay
     for (int i = 1; i < halfSize; i++) {
         Complex c(dft[i].r, dft[i].i);
--- a/dom/media/webaudio/FFTBlock.h
+++ b/dom/media/webaudio/FFTBlock.h
@@ -2,36 +2,53 @@
 /* vim:set ts=2 sw=2 sts=2 et cindent: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef FFTBlock_h_
 #define FFTBlock_h_
 
-#include "nsTArray.h"
+#ifdef BUILD_ARM_NEON
+#include <cmath>
+#include "mozilla/arm.h"
+#include "dl/sp/api/omxSP.h"
+#endif
+
+#include "AlignedTArray.h"
 #include "AudioNodeEngine.h"
 #include "kiss_fft/kiss_fftr.h"
 
 namespace mozilla {
 
 // This class defines an FFT block, loosely modeled after Blink's FFTFrame
 // class to make sharing code with Blink easy.
 // Currently it's implemented on top of KissFFT on all platforms.
 class FFTBlock final
 {
+  union ComplexU { // one complex value, exposed through three overlapping views
+    kiss_fft_cpx c; // view handed to the KissFFT calls
+    float f[2];     // flat float view handed to the OpenMAX DL and BufferComplexMultiply calls
+    struct {        // named parts; presumably r = real, i = imaginary - matches RealData()/ImagData() usage
+      float r;
+      float i;
+    };
+  };
+
 public:
   explicit FFTBlock(uint32_t aFFTSize)
-    : mFFT(nullptr)
-    , mIFFT(nullptr)
-    , mFFTSize(aFFTSize)
+    : mKissFFT(nullptr)
+    , mKissIFFT(nullptr)
+#ifdef BUILD_ARM_NEON
+    , mOmxFFT(nullptr)
+    , mOmxIFFT(nullptr)
+#endif
   {
     MOZ_COUNT_CTOR(FFTBlock);
-    mOutputBuffer.SetLength(aFFTSize / 2 + 1);
-    PodZero(mOutputBuffer.Elements(), aFFTSize / 2 + 1);
+    SetFFTSize(aFFTSize);
   }
   ~FFTBlock()
   {
     MOZ_COUNT_DTOR(FFTBlock);
     Clear();
   }
 
   // Return a new FFTBlock with frequency components interpolated between
@@ -39,70 +56,94 @@ public:
   static FFTBlock*
   CreateInterpolatedBlock(const FFTBlock& block0,
                           const FFTBlock& block1, double interp);
 
   // Transform FFTSize() points of aData and store the result internally.
   void PerformFFT(const float* aData)
   {
     EnsureFFT();
-    kiss_fftr(mFFT, aData, mOutputBuffer.Elements());
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      omxSP_FFTFwd_RToCCS_F32_Sfs(aData, mOutputBuffer.Elements()->f, mOmxFFT);
+    } else
+#endif
+    {
+      kiss_fftr(mKissFFT, aData, &(mOutputBuffer.Elements()->c));
+    }
   }
   // Inverse-transform internal data and store the resulting FFTSize()
-  // points in aData.
+  // points in aDataOut.
   void GetInverse(float* aDataOut)
   {
     GetInverseWithoutScaling(aDataOut);
     AudioBufferInPlaceScale(aDataOut, 1.0f / mFFTSize, mFFTSize);
   }
   // Inverse-transform internal frequency data and store the resulting
   // FFTSize() points in |aDataOut|.  If frequency data has not already been
   // scaled, then the output will need scaling by 1/FFTSize().
   void GetInverseWithoutScaling(float* aDataOut)
   {
     EnsureIFFT();
-    kiss_fftri(mIFFT, mOutputBuffer.Elements(), aDataOut);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      omxSP_FFTInv_CCSToR_F32_Sfs(mOutputBuffer.Elements()->f, aDataOut, mOmxIFFT);
+      // There is no function that computes the inverse FFT without scaling, so
+      // we have to scale back up here. Bug 1158741.
+      AudioBufferInPlaceScale(aDataOut, mFFTSize, mFFTSize);
+    } else
+#endif
+    {
+      kiss_fftri(mKissIFFT, &(mOutputBuffer.Elements()->c), aDataOut);
+    }
   }
   // Inverse-transform the FFTSize()/2+1 points of data in each
   // of aRealDataIn and aImagDataIn and store the resulting
   // FFTSize() points in aRealDataOut.
   void PerformInverseFFT(float* aRealDataIn,
                          float *aImagDataIn,
                          float *aRealDataOut)
   {
     EnsureIFFT();
     const uint32_t inputSize = mFFTSize / 2 + 1;
-    nsTArray<kiss_fft_cpx> inputBuffer;
-    inputBuffer.SetLength(inputSize);
+    AlignedTArray<ComplexU> inputBuffer(inputSize); // NOTE(review): capacity constructor only; SetLength() is no longer called
     for (uint32_t i = 0; i < inputSize; ++i) {
       inputBuffer[i].r = aRealDataIn[i];
       inputBuffer[i].i = aImagDataIn[i];
     }
-    kiss_fftri(mIFFT, inputBuffer.Elements(), aRealDataOut);
-    for (uint32_t i = 0; i < mFFTSize; ++i) {
-      aRealDataOut[i] /= mFFTSize;
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) { // no explicit 1/mFFTSize step on this path - presumably the OpenMAX routine scales; confirm
+      omxSP_FFTInv_CCSToR_F32_Sfs(inputBuffer.Elements()->f,
+                                  aRealDataOut, mOmxIFFT);
+    } else
+#endif
+    {
+      kiss_fftri(mKissIFFT, &(inputBuffer.Elements()->c), aRealDataOut);
+      for (uint32_t i = 0; i < mFFTSize; ++i) { // KissFFT path: divide every output sample by the FFT size
+        aRealDataOut[i] /= mFFTSize;
+      }
     }
   }
 
   void Multiply(const FFTBlock& aFrame)
   {
-    BufferComplexMultiply(reinterpret_cast<const float*>(mOutputBuffer.Elements()),
-                          reinterpret_cast<const float*>(aFrame.mOutputBuffer.Elements()),
-                          reinterpret_cast<float*>(mOutputBuffer.Elements()),
+    BufferComplexMultiply(mOutputBuffer.Elements()->f,
+                          aFrame.mOutputBuffer.Elements()->f,
+                          mOutputBuffer.Elements()->f,
                           mFFTSize / 2 + 1);
   }
 
   // Perform a forward FFT on |aData|, assuming zeros after dataSize samples,
   // and pre-scale the generated internal frequency domain coefficients so
   // that GetInverseWithoutScaling() can be used to transform to the time
   // domain.  This is useful for convolution kernels.
   void PadAndMakeScaledDFT(const float* aData, size_t dataSize)
   {
     MOZ_ASSERT(dataSize <= FFTSize());
-    nsTArray<float> paddedData;
+    AlignedTArray<float> paddedData;
     paddedData.SetLength(FFTSize());
     AudioBufferCopyWithScale(aData, 1.0f / FFTSize(),
                              paddedData.Elements(), dataSize);
     PodZero(paddedData.Elements() + dataSize, mFFTSize - dataSize);
     PerformFFT(paddedData.Elements());
   }
 
   void SetFFTSize(uint32_t aSize)
@@ -127,54 +168,101 @@ public:
   float ImagData(uint32_t aIndex) const
   {
     return mOutputBuffer[aIndex].i;
   }
 
   size_t SizeOfExcludingThis(MallocSizeOf aMallocSizeOf) const
   {
     size_t amount = 0;
-    amount += aMallocSizeOf(mFFT);
-    amount += aMallocSizeOf(mIFFT);
+    amount += aMallocSizeOf(mKissFFT);
+    amount += aMallocSizeOf(mKissIFFT); // NOTE(review): mOmxFFT/mOmxIFFT (BUILD_ARM_NEON) are not measured here
     amount += mOutputBuffer.SizeOfExcludingThis(aMallocSizeOf);
     return amount;
   }
 
   size_t SizeOfIncludingThis(MallocSizeOf aMallocSizeOf) const
   {
     return aMallocSizeOf(this) + SizeOfExcludingThis(aMallocSizeOf);
   }
 
 private:
   FFTBlock(const FFTBlock& other) = delete;
   void operator=(const FFTBlock& other) = delete;
 
   void EnsureFFT()
   {
-    if (!mFFT) {
-      mFFT = kiss_fftr_alloc(mFFTSize, 0, nullptr, nullptr);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      if (!mOmxFFT) {
+        mOmxFFT = createOmxFFT(mFFTSize);
+      }
+    } else
+#endif
+    {
+      if (!mKissFFT) {
+        mKissFFT = kiss_fftr_alloc(mFFTSize, 0, nullptr, nullptr);
+      }
     }
   }
   void EnsureIFFT()
   {
-    if (!mIFFT) {
-      mIFFT = kiss_fftr_alloc(mFFTSize, 1, nullptr, nullptr);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      if (!mOmxIFFT) {
+        mOmxIFFT = createOmxFFT(mFFTSize);
+      }
+    } else
+#endif
+    {
+      if (!mKissIFFT) {
+        mKissIFFT = kiss_fftr_alloc(mFFTSize, 1, nullptr, nullptr);
+      }
     }
   }
+
+#ifdef BUILD_ARM_NEON
+  static OMXFFTSpec_R_F32* createOmxFFT(uint32_t aFFTSize)
+  {
+    MOZ_ASSERT((aFFTSize & (aFFTSize-1)) == 0); // the size must be a power of two
+    OMX_INT bufSize;
+    OMX_INT order = log((double)aFFTSize)/M_LN2; // base-2 logarithm of the FFT size
+    MOZ_ASSERT(aFFTSize>>order == 1);
+    if (omxSP_FFTGetBufSize_R_F32(order, &bufSize) != OMX_Sts_NoErr) {
+      return nullptr;
+    }
+    OMXFFTSpec_R_F32* context = static_cast<OMXFFTSpec_R_F32*>(malloc(bufSize));
+    if (context && omxSP_FFTInit_R_F32(context, order) != OMX_Sts_NoErr) {
+      free(context); // do not leak the spec buffer when initialization fails
+      context = nullptr;
+    }
+    return context; // nullptr on any failure (including a failed malloc); otherwise released with free()
+  }
+#endif
+
   void Clear()
   {
-    free(mFFT);
-    free(mIFFT);
-    mFFT = mIFFT = nullptr;
+#ifdef BUILD_ARM_NEON
+    free(mOmxFFT);
+    free(mOmxIFFT);
+    mOmxFFT = mOmxIFFT = nullptr;
+#endif
+    free(mKissFFT);
+    free(mKissIFFT);
+    mKissFFT = mKissIFFT = nullptr;
   }
   void AddConstantGroupDelay(double sampleFrameDelay);
   void InterpolateFrequencyComponents(const FFTBlock& block0,
                                       const FFTBlock& block1, double interp);
 
-  kiss_fftr_cfg mFFT, mIFFT;
-  nsTArray<kiss_fft_cpx> mOutputBuffer;
+  kiss_fftr_cfg mKissFFT;
+  kiss_fftr_cfg mKissIFFT;
+#ifdef BUILD_ARM_NEON
+  OMXFFTSpec_R_F32* mOmxFFT;
+  OMXFFTSpec_R_F32* mOmxIFFT;
+#endif
+  AlignedTArray<ComplexU> mOutputBuffer;
   uint32_t mFFTSize;
 };
-
 }
 
 #endif
 
--- a/dom/media/webaudio/blink/FFTConvolver.h
+++ b/dom/media/webaudio/blink/FFTConvolver.h
@@ -30,17 +30,17 @@
 #define FFTConvolver_h
 
 #include "nsTArray.h"
 #include "mozilla/FFTBlock.h"
 #include "mozilla/MemoryReporting.h"
 
 namespace WebCore {
 
-typedef nsTArray<float> AudioFloatArray;
+typedef AlignedTArray<float> AlignedAudioFloatArray;
 using mozilla::FFTBlock;
 
 class FFTConvolver {
 public:
     // fftSize must be a power of two
     explicit FFTConvolver(size_t fftSize);
 
     // |fftKernel| must be pre-scaled for FFTBlock::GetInverseWithoutScaling().
@@ -61,20 +61,20 @@ public:
     size_t sizeOfExcludingThis(mozilla::MallocSizeOf aMallocSizeOf) const;
     size_t sizeOfIncludingThis(mozilla::MallocSizeOf aMallocSizeOf) const;
 
 private:
     FFTBlock m_frame;
 
     // Buffer input until we get fftSize / 2 samples then do an FFT
     size_t m_readWriteIndex;
-    AudioFloatArray m_inputBuffer;
+    AlignedAudioFloatArray m_inputBuffer;
 
     // Stores output which we read a little at a time
-    AudioFloatArray m_outputBuffer;
+    AlignedAudioFloatArray m_outputBuffer;
 
     // Saves the 2nd half of the FFT buffer, so we can do an overlap-add with the 1st half of the next one
-    AudioFloatArray m_lastOverlapBuffer;
+    AlignedAudioFloatArray m_lastOverlapBuffer;
 };
 
 } // namespace WebCore
 
 #endif // FFTConvolver_h
--- a/dom/media/webaudio/blink/HRTFKernel.cpp
+++ b/dom/media/webaudio/blink/HRTFKernel.cpp
@@ -46,16 +46,24 @@ static float extractAverageGroupDelay(fl
 
     return frameDelay;
 }
 
 HRTFKernel::HRTFKernel(float* impulseResponse, size_t length, float sampleRate)
     : m_frameDelay(0)
     , m_sampleRate(sampleRate)
 {
+    AlignedTArray<float> buffer;
+    // copy to a 32-byte aligned buffer
+    if (((uintptr_t)impulseResponse & 31) != 0) {
+      buffer.SetLength(length);
+      mozilla::PodCopy(buffer.Elements(), impulseResponse, length);
+      impulseResponse = buffer.Elements();
+    }
+
     // Determine the leading delay (average group delay) for the response.
     m_frameDelay = extractAverageGroupDelay(impulseResponse, length);
 
     // The FFT size (with zero padding) needs to be twice the response length
     // in order to do proper convolution.
     unsigned fftSize = 2 * length;
 
     // Quick fade-out (apply window) at truncation point
@@ -74,26 +82,26 @@ HRTFKernel::HRTFKernel(float* impulseRes
 }
 
 // Interpolates two kernels with x: 0 -> 1 and returns the result.
 nsReturnRef<HRTFKernel> HRTFKernel::createInterpolatedKernel(HRTFKernel* kernel1, HRTFKernel* kernel2, float x)
 {
     MOZ_ASSERT(kernel1 && kernel2);
     if (!kernel1 || !kernel2)
         return nsReturnRef<HRTFKernel>();
- 
+
     MOZ_ASSERT(x >= 0.0 && x < 1.0);
     x = mozilla::clamped(x, 0.0f, 1.0f);
-    
+
     float sampleRate1 = kernel1->sampleRate();
     float sampleRate2 = kernel2->sampleRate();
     MOZ_ASSERT(sampleRate1 == sampleRate2);
     if (sampleRate1 != sampleRate2)
         return nsReturnRef<HRTFKernel>();
-    
+
     float frameDelay = (1 - x) * kernel1->frameDelay() + x * kernel2->frameDelay();
-    
+
     nsAutoPtr<FFTBlock> interpolatedFrame(
         FFTBlock::CreateInterpolatedBlock(*kernel1->fftFrame(), *kernel2->fftFrame(), x));
     return HRTFKernel::create(interpolatedFrame, frameDelay, sampleRate1);
 }
 
 } // namespace WebCore
--- a/dom/media/webaudio/blink/HRTFPanner.h
+++ b/dom/media/webaudio/blink/HRTFPanner.h
@@ -30,16 +30,18 @@
 #include "mozilla/MemoryReporting.h"
 
 namespace mozilla {
 struct AudioChunk;
 }
 
 namespace WebCore {
 
+typedef nsTArray<float> AudioFloatArray;
+
 class HRTFDatabaseLoader;
 
 using mozilla::AudioChunk;
 
 class HRTFPanner {
 public:
     HRTFPanner(float sampleRate, mozilla::TemporaryRef<HRTFDatabaseLoader> databaseLoader);
     ~HRTFPanner();
--- a/dom/media/webaudio/blink/PeriodicWave.cpp
+++ b/dom/media/webaudio/blink/PeriodicWave.cpp
@@ -215,17 +215,17 @@ void PeriodicWave::createBandLimitedTabl
         // Clear any DC-offset.
         realP[0] = 0;
 
         // Clear values which have no effect.
         imagP[0] = 0;
         imagP[halfSize-1] = 0;
 
         // Create the band-limited table.
-        AudioFloatArray* table = new AudioFloatArray(m_periodicWaveSize);
+        AlignedAudioFloatArray* table = new AlignedAudioFloatArray(m_periodicWaveSize);
         m_bandLimitedTables.AppendElement(table);
 
         // Apply an inverse FFT to generate the time-domain table data.
         float* data = m_bandLimitedTables[rangeIndex]->Elements();
         frame.PerformInverseFFT(realP, imagP, data);
 
         // For the first range (which has the highest power), calculate
         // its peak value then compute normalization scale.
--- a/dom/media/webaudio/blink/PeriodicWave.h
+++ b/dom/media/webaudio/blink/PeriodicWave.h
@@ -27,20 +27,22 @@
  */
 
 #ifndef PeriodicWave_h
 #define PeriodicWave_h
 
 #include "mozilla/dom/OscillatorNodeBinding.h"
 #include <nsAutoPtr.h>
 #include <nsTArray.h>
+#include "AlignedTArray.h"
 #include "mozilla/MemoryReporting.h"
 
 namespace WebCore {
 
+typedef AlignedTArray<float> AlignedAudioFloatArray;
 typedef nsTArray<float> AudioFloatArray;
 
 class PeriodicWave {
 public:
     static PeriodicWave* createSine(float sampleRate);
     static PeriodicWave* createSquare(float sampleRate);
     static PeriodicWave* createSawtooth(float sampleRate);
     static PeriodicWave* createTriangle(float sampleRate);
@@ -93,14 +95,14 @@ private:
 
     // Maximum possible number of partials (before culling).
     unsigned maxNumberOfPartials() const;
 
     unsigned numberOfPartialsForRange(unsigned rangeIndex) const;
 
     // Creates tables based on numberOfComponents Fourier coefficients.
     void createBandLimitedTables(const float* real, const float* imag, unsigned numberOfComponents);
-    nsTArray<nsAutoPtr<AudioFloatArray> > m_bandLimitedTables;
+    nsTArray<nsAutoPtr<AlignedAudioFloatArray> > m_bandLimitedTables;
 };
 
 } // namespace WebCore
 
 #endif // PeriodicWave_h
--- a/dom/media/webaudio/moz.build
+++ b/dom/media/webaudio/moz.build
@@ -16,16 +16,17 @@ MOCHITEST_MANIFESTS += [
     'test/mochitest.ini',
 ]
 
 MOCHITEST_CHROME_MANIFESTS += ['test/chrome.ini']
 
 BROWSER_CHROME_MANIFESTS += ['test/browser.ini']
 
 EXPORTS += [
+    'AlignedTArray.h',
     'AudioContext.h',
     'AudioEventTimeline.h',
     'AudioNodeEngine.h',
     'AudioNodeExternalInputStream.h',
     'AudioNodeStream.h',
     'AudioParamTimeline.h',
     'MediaBufferDecoder.h',
     'ThreeDPoint.h',