Bug 509052: Add new, faster blurring code. r=derf
authorBas Schouten <bschouten@mozilla.com>
Wed, 07 Nov 2012 09:29:54 +0100
changeset 120509 3169efab0148170c69039d2ae592c40b4141a46e
parent 120508 0825269eee60917b84f802e0cd648e67a020d217
child 120510 f520b6cc921f5630ab39a49dd8b2bdde2f011a64
push id1997
push userakeybl@mozilla.com
push dateMon, 07 Jan 2013 21:25:26 +0000
treeherdermozilla-beta@4baf45cdcf21 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersderf
bugs509052
milestone19.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 509052: Add new, faster blurring code. r=derf
gfx/2d/Blur.cpp
gfx/2d/Blur.h
gfx/2d/BlurSSE2.cpp
gfx/2d/ImageScalingSSE2.cpp
gfx/2d/Makefile.in
gfx/2d/SSEHelpers.h
gfx/2d/Tools.h
--- a/gfx/2d/Blur.cpp
+++ b/gfx/2d/Blur.cpp
@@ -7,16 +7,19 @@
 #include <algorithm>
 #include <math.h>
 #include <string.h>
 
 #include "mozilla/CheckedInt.h"
 #include "mozilla/Constants.h"
 #include "mozilla/Util.h"
 
+#include "2D.h"
+#include "Tools.h"
+
 using namespace std;
 
 namespace mozilla {
 namespace gfx {
 
 /**
  * Box blur involves looking at one pixel, and setting its value to the average
  * of its neighbouring pixels.
@@ -306,18 +309,18 @@ SpreadVertical(unsigned char* aInput,
             for (int32_t s = sMin; s <= sMax; ++s) {
                 v = max<int32_t>(v, aInput[aStride * s + x]);
             }
             aOutput[aStride * y + x] = v;
         }
     }
 }
 
-static CheckedInt<int32_t>
-RoundUpToMultipleOf4(int32_t aVal)
+CheckedInt<int32_t>
+AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
 {
   CheckedInt<int32_t> val(aVal);
 
   val += 3;
   val /= 4;
   val *= 4;
 
   return val;
@@ -373,20 +376,21 @@ AlphaBoxBlur::AlphaBoxBlur(const Rect& a
   } else {
     mSkipRect = IntRect(0, 0, 0, 0);
   }
 
   CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.width);
   if (stride.isValid()) {
     mStride = stride.value();
 
-    CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height *
-                               sizeof(unsigned char);
+    // We need to leave room for an additional 3 bytes for a potential overrun
+    // in our blurring code.
+    CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height + 3;
     if (size.isValid()) {
-      mData = static_cast<unsigned char*>(malloc(size.value()));
+      mData = new uint8_t[size.value()];
       memset(mData, 0, size.value());
     }
   }
 }
 
 AlphaBoxBlur::AlphaBoxBlur(uint8_t* aData,
                            const Rect& aRect,
                            int32_t aStride,
@@ -400,17 +404,17 @@ AlphaBoxBlur::AlphaBoxBlur(uint8_t* aDat
     mStride(aStride)
 {
 }
 
 
 AlphaBoxBlur::~AlphaBoxBlur()
 {
   if (mFreeData) {
-    free(mData);
+    delete [] mData;
   }
 }
 
 unsigned char*
 AlphaBoxBlur::GetData()
 {
   return mData;
 }
@@ -450,52 +454,248 @@ AlphaBoxBlur::Blur()
   if (!mData) {
     return;
   }
 
   // no need to do all this if not blurring or spreading
   if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
     int32_t stride = GetStride();
 
-    // No need to use CheckedInt here - we have validated it in the constructor.
-    size_t szB = stride * GetSize().height * sizeof(unsigned char);
-    unsigned char* tmpData = static_cast<unsigned char*>(malloc(szB));
-    if (!tmpData)
-      return; // OOM
-
-    memset(tmpData, 0, szB);
+    IntSize size = GetSize();
 
     if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
+      // No need to use CheckedInt here - we have validated it in the constructor.
+      size_t szB = stride * size.height;
+      unsigned char* tmpData = new uint8_t[szB];
+
+      memset(tmpData, 0, szB);
+
       SpreadHorizontal(mData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
       SpreadVertical(tmpData, mData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);
+
+      delete [] tmpData;
     }
 
-    if (mBlurRadius.width > 0) {
-      int32_t lobes[3][2];
-      ComputeLobes(mBlurRadius.width, lobes);
-      BoxBlurHorizontal(mData, tmpData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
-      BoxBlurHorizontal(tmpData, mData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
-      BoxBlurHorizontal(mData, tmpData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
+    int32_t horizontalLobes[3][2];
+    ComputeLobes(mBlurRadius.width, horizontalLobes);
+    int32_t verticalLobes[3][2];
+    ComputeLobes(mBlurRadius.height, verticalLobes);
+
+    // We want to allow for some extra space on the left for alignment reasons.
+    int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
+
+    IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
+                              size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
+
+#ifdef IS_BIG_ENDIAN
+    const bool cIsBigEndian = true;
+#else
+    const bool cIsBigEndian = false;
+#endif
+
+    if (cIsBigEndian || (integralImageSize.width * integralImageSize.height) > (1 << 24)) {
+      // Fallback to old blurring code when the surface is so large it may
+      // overflow our integral image!
+
+      // No need to use CheckedInt here - we have validated it in the constructor.
+      size_t szB = stride * size.height;
+      unsigned char* tmpData = new uint8_t[szB];
+
+      memset(tmpData, 0, szB);
+
+      if (mBlurRadius.width > 0) {
+        BoxBlurHorizontal(mData, tmpData, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
+        BoxBlurHorizontal(tmpData, mData, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
+        BoxBlurHorizontal(mData, tmpData, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
+      } else {
+        uint8_t *tmp = mData;
+        mData = tmpData;
+        tmpData = tmp;
+      }
+      if (mBlurRadius.height > 0) {
+        BoxBlurVertical(tmpData, mData, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
+        BoxBlurVertical(mData, tmpData, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
+        BoxBlurVertical(tmpData, mData, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
+      } else {
+        uint8_t *tmp = mData;
+        mData = tmpData;
+        tmpData = tmp;
+      }
+
+      delete [] tmpData;
     } else {
-      memcpy(tmpData, mData, stride * GetSize().height);
-    }
+      size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);
+
+      // We need to leave room for an additional 12 bytes for a maximum overrun
+      // of 3 pixels in the blurring code.
+      AlignedArray<uint32_t> integralImage((integralImageStride / 4) * integralImageSize.height + 12);
 
-    if (mBlurRadius.height > 0) {
-      int32_t lobes[3][2];
-      ComputeLobes(mBlurRadius.height, lobes);
-      BoxBlurVertical(tmpData, mData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
-      BoxBlurVertical(mData, tmpData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
-      BoxBlurVertical(tmpData, mData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
-    } else {
-      memcpy(mData, tmpData, stride * GetSize().height);
+#ifdef USE_SSE2
+      if (Factory::HasSSE2()) {
+        BoxBlur_SSE2(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
+                     verticalLobes[0][1], integralImage, integralImageStride);
+        BoxBlur_SSE2(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
+                     verticalLobes[1][1], integralImage, integralImageStride);
+        BoxBlur_SSE2(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
+                     verticalLobes[2][1], integralImage, integralImageStride);
+      } else
+#endif
+      {
+        BoxBlur_C(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
+                  verticalLobes[0][1], integralImage, integralImageStride);
+        BoxBlur_C(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
+                  verticalLobes[1][1], integralImage, integralImageStride);
+        BoxBlur_C(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
+                  verticalLobes[2][1], integralImage, integralImageStride);
+      }
     }
+  }
+}
 
-    free(tmpData);
+MOZ_ALWAYS_INLINE void
+GenerateIntegralRow(uint32_t  *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
+                    const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
+{
+  uint32_t currentRowSum = 0;
+  uint32_t pixel = aSource[0];
+  for (uint32_t x = 0; x < aLeftInflation; x++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+  for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x += 4) {
+      uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
+      currentRowSum += alphaValues & 0xff;
+      *aDest++ = *aPreviousRow++ + currentRowSum;
+      alphaValues >>= 8;
+      currentRowSum += alphaValues & 0xff;
+      *aDest++ = *aPreviousRow++ + currentRowSum;
+      alphaValues >>= 8;
+      currentRowSum += alphaValues & 0xff;
+      *aDest++ = *aPreviousRow++ + currentRowSum;
+      alphaValues >>= 8;
+      currentRowSum += alphaValues & 0xff;
+      *aDest++ = *aPreviousRow++ + currentRowSum;
+  }
+  pixel = aSource[aSourceWidth - 1];
+  for (uint32_t x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+}
+
+MOZ_ALWAYS_INLINE void
+GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
+                        int32_t aTopInflation, int32_t aBottomInflation,
+                        uint32_t *aIntegralImage, size_t aIntegralImageStride,
+                        uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
+{
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
+                            aSize.height + aTopInflation + aBottomInflation);
+
+  memset(aIntegralImage, 0, aIntegralImageStride);
+
+  GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
+                      aSize.width, aLeftInflation, aRightInflation);
+  for (int y = 1; y < aTopInflation + 1; y++) {
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+    uint32_t *intFirstRow = aIntegralImage;
+
+    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource, aIntegralImage + (y - 1) * stride32bit,
+                        aSize.width, aLeftInflation, aRightInflation);
   }
 
+  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
+    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation),
+                        aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation);
+  }
+
+  if (aBottomInflation) {
+    for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) {
+      GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride),
+                          aIntegralImage + (y - 1) * stride32bit,
+                          aSize.width, aLeftInflation, aRightInflation);
+    }
+  }
+}
+
+/**
+ * Attempt to do an in-place box blur using an integral image.
+ */
+void
+AlphaBoxBlur::BoxBlur_C(int32_t aLeftLobe,
+                        int32_t aRightLobe,
+                        int32_t aTopLobe,
+                        int32_t aBottomLobe,
+                        uint32_t *aIntegralImage,
+                        size_t aIntegralImageStride)
+{
+  IntSize size = GetSize();
+
+  MOZ_ASSERT(size.width > 0);
+
+  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
+  // looking at an integral image the value of a pixel at 'x,y' is calculated
+  // using the value of the integral image values above/below that.
+  aLeftLobe++;
+  aTopLobe++;
+  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
+
+  MOZ_ASSERT(boxSize > 0);
+
+  if (boxSize == 1) {
+      return;
+  }
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
+
+  GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
+                          aIntegralImage, aIntegralImageStride, mData,
+                          mStride, size);
+
+  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
+
+  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
+
+  // Storing these locally makes this about 30% faster! Presumably the compiler
+  // can't be sure we're not altering the member variables in this loop.
+  IntRect skipRect = mSkipRect;
+  uint8_t *data = mData;
+  int32_t stride = mStride;
+  for (int32_t y = 0; y < size.height; y++) {
+    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
+
+    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
+    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
+    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
+    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
+
+    for (int32_t x = 0; x < size.width; x++) {
+      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
+        x = skipRect.XMost() - 1;
+        // Trigger early jump on coming loop iterations, this will be reset
+        // next line anyway.
+        inSkipRectY = false;
+        continue;
+      }
+      int32_t topLeft = topLeftBase[x];
+      int32_t topRight = topRightBase[x];
+      int32_t bottomRight = bottomRightBase[x];
+      int32_t bottomLeft = bottomLeftBase[x];
+
+      uint32_t value = bottomRight - topRight - bottomLeft;
+      value += topLeft;
+
+      data[stride * y + x] = (uint64_t(reciprocal) * value) >> 32;
+    }
+  }
 }
 
 /**
  * Compute the box blur size (which we're calling the blur radius) from
  * the standard deviation.
  *
  * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
  * approximating a Gaussian using box blurs.  This yields quite a good
--- a/gfx/2d/Blur.h
+++ b/gfx/2d/Blur.h
@@ -2,16 +2,17 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef MOZILLA_GFX_BLUR_H_
 #define MOZILLA_GFX_BLUR_H_
 
 #include "mozilla/gfx/Rect.h"
 #include "mozilla/gfx/Point.h"
+#include "mozilla/CheckedInt.h"
 
 namespace mozilla {
 namespace gfx {
 
 #ifdef _MSC_VER
 #pragma warning( disable : 4251 )
 #endif
 
@@ -109,16 +110,23 @@ public:
    * Gaussian blur with the given standard deviation.  The result of this
    * function should be used as the aBlurRadius parameter to AlphaBoxBlur's
    * constructor, above.
    */
   static IntSize CalculateBlurRadius(const Point& aStandardDeviation);
 
 private:
 
+  void BoxBlur_C(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
+                 int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
+  void BoxBlur_SSE2(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
+                    int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
+
+  static CheckedInt<int32_t> RoundUpToMultipleOf4(int32_t aVal);
+
   /**
    * A rect indicating the area where blurring is unnecessary, and the blur
    * algorithm should skip over it.
    */
   IntRect mSkipRect;
 
   /**
   * The device-space rectangle that the backing 8-bit alpha surface covers.
new file mode 100644
--- /dev/null
+++ b/gfx/2d/BlurSSE2.cpp
@@ -0,0 +1,250 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Blur.h"
+
+#include "SSEHelpers.h"
+
+#include <string.h>
+
+namespace mozilla {
+namespace gfx {
+
/**
 * Divides the four 32-bit unsigned sums in aValues by the box area and packs
 * the four 8-bit quotients into one 32-bit result (one output pixel group).
 * aDivisor holds the fixed-point reciprocal (2^32 / boxSize) in every lane,
 * so the division is a multiply-high. aMask is (0, ~0, 0, ~0): it selects
 * the high 32-bit halves of the odd-lane 64-bit products.
 */
MOZ_ALWAYS_INLINE
uint32_t DivideAndPack(__m128i aValues, __m128i aDivisor, __m128i aMask)
{
  // _mm_mul_epu32 multiplies lanes 0 and 2; shifting the 64-bit products
  // right by 32 leaves quotients 1 and 3 in the even 32-bit lanes.
  __m128i multiplied = _mm_srli_epi64(_mm_mul_epu32(aValues, aDivisor), 32); // 00p300p1
  // Multiply lanes 1 and 3 (moved down first); their quotients already sit
  // in the odd 32-bit lanes, so mask and OR them in.
  multiplied = _mm_or_si128(multiplied, _mm_and_si128(_mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor),
    aMask)); // p4p3p2p1
  // Narrow 32 -> 16 -> 8 bits with saturation and return the low 4 bytes.
  __m128i final = _mm_packus_epi16(_mm_packs_epi32(multiplied, _mm_setzero_si128()), _mm_setzero_si128());

  return _mm_cvtsi128_si32(final);
}
+
+MOZ_ALWAYS_INLINE
+void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
+                            int32_t aSourceWidth, int32_t aLeftInflation,
+                            int32_t aRightInflation)
+{
+  int32_t currentRowSum = 0;
+
+  for (int x = 0; x < aLeftInflation; x++) {
+    currentRowSum += aSource[0];
+    aDest[x] = currentRowSum;
+  }
+  for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
+    currentRowSum += aSource[(x - aLeftInflation)];
+    aDest[x] = currentRowSum;
+  }
+  for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
+    currentRowSum += aSource[aSourceWidth - 1];
+    aDest[x] = currentRowSum;
+  }
+}
+
// This function calculates an integral of four pixels stored in the 4
// 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns
// { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
// much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
  __m128i sumPixels = aPixels;
  // Shift one lane toward the high end and add: {a, a+b, b+c, c+d}.
  __m128i currentPixels = _mm_slli_si128(aPixels, 4);
  sumPixels = _mm_add_epi32(sumPixels, currentPixels);
  // Move the low 64 bits (a, a+b) up to the high lanes: {0, 0, a, a+b}.
  currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);

  // Final add yields the prefix sums {a, a+b, a+b+c, a+b+c+d}.
  return _mm_add_epi32(sumPixels, currentPixels);
}
+
/**
 * Builds the integral image for the SSE2 blur path: integral[y][x] is the
 * sum of all source pixels at or above-left of (x,y), with the source's edge
 * pixels replicated into the inflation borders. Rows are produced four
 * 32-bit sums at a time using aligned 128-bit stores, which is why the left
 * inflation must be a multiple of 4 and the integral stride 16-byte aligned.
 */
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
                           int32_t aTopInflation, int32_t aBottomInflation,
                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  MOZ_ASSERT(!(aLeftInflation & 3));

  // Integral stride in uint32_t units (aIntegralImageStride is in bytes).
  uint32_t stride32bit = aIntegralImageStride / 4;

  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  // First row: plain horizontal prefix sums of the (replicated) first row.
  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);

  // Top inflation rows all replicate the first source row, so each row is
  // simply the first row's sums added onto the row above.
  for (int y = 1; y < aTopInflation + 1; y++) {
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint32_t *intFirstRow = aIntegralImage;

    for (int x = 0; x < integralImageSize.width; x += 4) {
      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
    }
  }

  // The source rows proper: horizontal prefix sums (four lanes at a time)
  // plus the row above.
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    __m128i currentRowSum = _mm_setzero_si128();
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);

    // Left inflation: the leftmost pixel repeated.
    uint32_t pixel = sourceRow[0];
    for (int x = 0; x < aLeftInflation; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      // Broadcast the running total (lane 3) for the next group of four.
      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));

      // It's important to shuffle here. When we exit this loop currentRowSum
      // has to be set to sumPixels, so that the following loop can get the
      // correct pixel for the currentRowSum. The highest order pixel in
      // currentRowSum could've originated from accumulation in the stride.
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));

      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = sumPixels;

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }

    // Right inflation: the rightmost pixel repeated. If the width was not a
    // multiple of 4 we first finish the partial group with scalar code until
    // x is 16-byte aligned again.
    pixel = sourceRow[aSize.width - 1];
    int x = (aSize.width + aLeftInflation);
    if ((aSize.width & 3)) {
      // Deal with unaligned portion. Get the correct pixel from currentRowSum,
      // see explanation above.
      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
      for (; x < integralImageSize.width; x++) {
        // We could be unaligned here!
        if (!(x & 3)) {
          // aligned!
          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
          break;
        }
        intCurrentRowSum += pixel;
        intRow[x] = intPrevRow[x] + intCurrentRowSum;
      }
    } else {
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
    }
    for (; x < integralImageSize.width; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
  }

  if (aBottomInflation) {
    // Store the last valid row of our source image in the last row of
    // our integral image. This will be overwritten with the correct values
    // in the upcoming loop.
    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);


    // Each bottom inflation row is the replicated last row's sums added onto
    // the row above (the last row's prefix sums were stashed above).
    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);

      for (int x = 0; x < integralImageSize.width; x += 4) {
        _mm_store_si128(intRow + (x / 4),
                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
                                      _mm_load_si128(intPrevRow + (x / 4))));
      }
    }
  }
}
+
/**
 * Attempt to do an in-place box blur using an integral image.
 *
 * Each output pixel is the average over the box
 * [x - aLeftLobe, x + aRightLobe] x [y - aTopLobe, y + aBottomLobe],
 * computed from four integral-image corner reads per pixel, four pixels at a
 * time. aIntegralImage is caller-provided 16-byte-aligned scratch sized in
 * Blur() with 12 bytes of slack, because the 4-wide inner loop may write up
 * to 3 pixels past the surface width.
 */
void
AlphaBoxBlur::BoxBlur_SSE2(int32_t aLeftLobe,
                           int32_t aRightLobe,
                           int32_t aTopLobe,
                           int32_t aBottomLobe,
                           uint32_t *aIntegralImage,
                           size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image the value of a pixel at 'x,y' is calculated
  // using the value of the integral image values above/below that.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  if (boxSize == 1) {
      // A 1x1 box blur is the identity; nothing to do.
      return;
  }

  // Fixed-point reciprocal of the box area; division in the inner loop
  // becomes a multiply-high (see DivideAndPack).
  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

  uint32_t stride32bit = aIntegralImageStride / 4;
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, mData,
                             mStride, size);

  __m128i divisor = _mm_set1_epi32(reciprocal);
  __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);

  // This points to the start of the rectangle within the IntegralImage that overlaps
  // the surface being blurred.
  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

  // Local copies of members help the optimizer (cf. the identical trick in
  // BoxBlur_C).
  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t *data = mData;
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    // ptrdiff_t cast: (y - aTopLobe) is negative on the first rows, and
    // int * uint32_t would otherwise wrap in unsigned arithmetic.
    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);

    for (int32_t x = 0; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 4;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }
      // The corner pointers are offset by the lobes, so these loads are not
      // necessarily 16-byte aligned.
      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));

      // Box sums for four adjacent pixels from the integral-image corners.
      __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(bottomRight, topRight), bottomLeft), topLeft);

      // Divide the four sums by boxSize and store four result bytes at once;
      // this can overrun the row by up to 3 bytes (covered by the allocation
      // slack).
      *(uint32_t*)(data + stride * y + x) = DivideAndPack(values, divisor, mask);
    }
  }

}
+
+}
+}
--- a/gfx/2d/ImageScalingSSE2.cpp
+++ b/gfx/2d/ImageScalingSSE2.cpp
@@ -1,18 +1,17 @@
 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "ImageScaling.h"
 #include "mozilla/Attributes.h"
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#include "SSEHelpers.h"
 
 /* The functions below use the following system for averaging 4 pixels:
  *
  * The first observation is that a half-adder is implemented as follows:
  * R = S + 2C or in the case of a and b (a ^ b) + ((a & b) << 1);
  *
  * This can be trivially extended to three pixels by observing that when
  * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
@@ -103,27 +102,16 @@ MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4
 {
   __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
   b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
   a = t;
 
   return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
 }
 
-/* Before Nehalem _mm_loadu_si128 could be very slow, this trick is a little
- * faster. Once enough people are on architectures where _mm_loadu_si128 is
- * fast we can migrate to it.
- */
-MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
-{
-  // Yes! We use uninitialized memory here, we'll overwrite it though!
-  __m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
-  return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
-}
-
 MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 {
   uint32_t sum = a ^ b ^ c;
   uint32_t carry = (a & b) | (a & c) | (b & c);
 
   uint32_t mask = 0xfefefefe;
 
   // Not having a byte based average instruction means we should mask to avoid
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@@ -111,17 +111,20 @@ DEFINES += -DSK_A32_SHIFT=24 -DSK_R32_SH
 ifdef MOZ_DEBUG
 DEFINES += -DGFX_LOG_DEBUG -DGFX_LOG_WARNING
 endif
 
 # Are we targeting x86 or x64?  If so, build SSE2 files.
 ifneq (,$(INTEL_ARCHITECTURE))
 # VC2005 doesn't support _mm_castsi128_ps, so SSE2 is turned off
 ifneq (1400,$(_MSC_VER))
-CPPSRCS += ImageScalingSSE2.cpp
+CPPSRCS += \
+        ImageScalingSSE2.cpp \
+        BlurSSE2.cpp \
+        $(NULL)
 DEFINES += -DUSE_SSE2
 endif
 endif
 
 ifeq ($(MOZ_WIDGET_TOOLKIT),windows)
 CPPSRCS	+= \
         DrawTargetD2D.cpp \
         SourceSurfaceD2D.cpp \
@@ -156,16 +159,18 @@ DEFINES := $(filter-out -DUNICODE -D_UNI
 #EXTRA_DSO_LDOPTS += -framework OpenGL -framework AGL -framework QuickTime -framework AppKit -framework QuartzCore
 #endif
 
 # The file uses SSE2 intrinsics, so it needs special compile flags on some
 # compilers.
 ifneq (,$(INTEL_ARCHITECTURE))
 ifdef GNU_CC
 ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
+BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
 endif
 
 ifdef SOLARIS_SUNPRO_CXX
 ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
+BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
 endif
 endif
 
 CXXFLAGS += $(MOZ_CAIRO_CFLAGS) $(MOZ_PIXMAN_CFLAGS)
new file mode 100644
--- /dev/null
+++ b/gfx/2d/SSEHelpers.h
@@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+/* Before Nehalem _mm_loadu_si128 could be very slow, this trick is a little
+ * faster. Once enough people are on architectures where _mm_loadu_si128 is
+ * fast we can migrate to it.
+ */
+MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
+{
+  // Yes! We use uninitialized memory here, we'll overwrite it though!
+  __m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
+  return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
+}
--- a/gfx/2d/Tools.h
+++ b/gfx/2d/Tools.h
@@ -76,12 +76,70 @@ BytesPerPixel(SurfaceFormat aFormat)
     return 1;
   case FORMAT_R5G6B5:
     return 2;
   default:
     return 4;
   }
 }
 
+template<typename T, int alignment = 16>
+struct AlignedArray
+{
+  AlignedArray()
+    : mStorage(nullptr)
+    , mPtr(nullptr)
+  {
+  }
+
+  MOZ_ALWAYS_INLINE AlignedArray(size_t aSize)
+    : mStorage(nullptr)
+  {
+    Realloc(aSize);
+  }
+
+  MOZ_ALWAYS_INLINE ~AlignedArray()
+  {
+    delete [] mStorage;
+  }
+
+  void Dealloc()
+  {
+    delete [] mStorage;
+    mStorage = mPtr = nullptr;
+  }
+
+  MOZ_ALWAYS_INLINE void Realloc(size_t aSize)
+  {
+    delete [] mStorage;
+    mStorage = new T[aSize + (alignment - 1)];
+    if (uintptr_t(mStorage) % alignment) {
+      // Our storage does not start at a <alignment>-byte boundary. Make sure mData does!
+      mPtr = (uint32_t*)(uintptr_t(mStorage) +
+        (alignment - (uintptr_t(mStorage) % alignment)));
+    } else {
+      mPtr = mStorage;
+    }
+  }
+
+  MOZ_ALWAYS_INLINE operator T*()
+  {
+    return mPtr;
+  }
+
+  T *mStorage;
+  T *mPtr;
+};
+
/**
 * Rounds aStride up to the next multiple of the (compile-time) alignment.
 * Returns aStride unchanged when it is already aligned.
 */
template<int alignment>
int32_t GetAlignedStride(int32_t aStride)
{
  const int32_t remainder = aStride % alignment;
  return remainder ? aStride + (alignment - remainder) : aStride;
}
+
 }
 }
 
 #endif /* MOZILLA_GFX_TOOLS_H_ */