Bug 486918. Part 1: Import Chromium's higher-quality image scalers, since we know those to be good and shippable. r=jrmuizel
author      Joe Drew <joe@drew.ca>
date        Thu, 23 Aug 2012 15:36:04 -0400
changeset   108770 717fb1afa612
parent      108769 ff86ec766232
child       108771 780d5ccc064c
push id     15681
push user   jdrew@mozilla.com
push date   Mon, 01 Oct 2012 18:58:30 +0000
reviewers   jrmuizel
bugs        486918
milestone   18.0a1
gfx/2d/HelpersSkia.h
gfx/2d/Makefile.in
gfx/2d/Scale.cpp
gfx/2d/Scale.h
gfx/2d/convolver.cpp
gfx/2d/convolver.h
gfx/2d/image_operations.cpp
gfx/2d/image_operations.h
--- a/gfx/2d/HelpersSkia.h
+++ b/gfx/2d/HelpersSkia.h
@@ -5,16 +5,17 @@
 
 #ifndef MOZILLA_GFX_HELPERSSKIA_H_
 #define MOZILLA_GFX_HELPERSSKIA_H_
 
 #include "2D.h"
 #include "skia/SkCanvas.h"
 #include "skia/SkDashPathEffect.h"
 #include "mozilla/Assertions.h"
+#include <vector>
 
 namespace mozilla {
 namespace gfx {
 
 static inline SkBitmap::Config
 GfxFormatToSkiaConfig(SurfaceFormat format)
 {
   switch (format)
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@@ -24,16 +24,17 @@ EXPORTS_mozilla/gfx	= \
         BaseMargin.h \
         BaseRect.h \
         BaseSize.h \
         Blur.h \
         PathHelpers.h \
         Point.h \
         Matrix.h \
         Rect.h \
+        Scale.h \
         Types.h \
         Tools.h \
         UserData.h \
 	$(NULL)
 
 CPPSRCS	= \
         Factory.cpp \
         Rect.cpp \
@@ -41,16 +42,17 @@ CPPSRCS	= \
         DrawTargetCairo.cpp \
         SourceSurfaceCairo.cpp \
         PathCairo.cpp \
         DrawTargetRecording.cpp \
         PathRecording.cpp \
         RecordedEvent.cpp \
         DrawEventRecorder.cpp \
         Blur.cpp \
+        Scale.cpp \
         ScaledFontBase.cpp \
         DrawTargetDual.cpp \
         ImageScaling.cpp \
         SourceSurfaceRawData.cpp \
         $(NULL)
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 CPPSRCS	+= \
@@ -71,16 +73,18 @@ endif
 
 DEFINES += -DMOZ_GFX -DUSE_CAIRO -DGFX2D_INTERNAL
 
 ifdef MOZ_ENABLE_SKIA
 CPPSRCS	+= \
         SourceSurfaceSkia.cpp \
         DrawTargetSkia.cpp \
         PathSkia.cpp \
+        convolver.cpp \
+        image_operations.cpp \
         $(NULL)
 
 DEFINES += -DUSE_SKIA
 
 endif
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 ifdef MOZ_ENABLE_SKIA
@@ -130,16 +134,22 @@ DEFINES += -DWIN32 -DINITGUID
 ifdef MOZ_ENABLE_SKIA
 CPPSRCS += \
         ScaledFontWin.cpp \
         $(NULL)
 endif
 endif
 
 include $(topsrcdir)/config/rules.mk
+include $(topsrcdir)/ipc/chromium/chromium-config.mk
+
+# Due to bug 796023, we can't have -DUNICODE and -D_UNICODE; defining those
+# macros changes the type of LOGFONT to LOGFONTW instead of LOGFONTA. This
+# changes the symbol names of exported C++ functions that use LOGFONT.
+DEFINES := $(filter-out -DUNICODE -D_UNICODE,$(DEFINES))
 
 #ifeq ($(MOZ_WIDGET_TOOLKIT),cocoa)
 #CPPSRCS	+= \
 #        DrawTargetCG.cpp \
 #        SourceSurfaceCG.cpp \
 #	$(NULL)
 #
 ## Always link with OpenGL/AGL
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.cpp
@@ -0,0 +1,54 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Scale.h"
+
+#ifdef USE_SKIA
+#include "HelpersSkia.h"
+#include "skia/SkBitmap.h"
+#include "image_operations.h"
+#endif
+
+namespace mozilla {
+namespace gfx {
+
+bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+           uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+           SurfaceFormat format)
+{
+#ifdef USE_SKIA
+  bool opaque = (format != FORMAT_B8G8R8A8);
+
+  SkBitmap::Config config = GfxFormatToSkiaConfig(format);
+
+  SkBitmap imgSrc;
+  imgSrc.setConfig(config, srcWidth, srcHeight, srcStride);
+  imgSrc.setPixels(srcData);
+  imgSrc.setIsOpaque(opaque);
+
+  // Rescaler is compatible with 32 bpp only. Convert to RGB32 if needed.
+  if (config != SkBitmap::kARGB_8888_Config) {
+    imgSrc.copyTo(&imgSrc, SkBitmap::kARGB_8888_Config);
+  }
+
+  // Resize() returns an SkBitmap backed by dstData; the scaled pixels have
+  // already been written into dstData, so we only use the returned bitmap to
+  // check that scaling succeeded.
+  SkBitmap result = skia::ImageOperations::Resize(imgSrc,
+                                                  skia::ImageOperations::RESIZE_BEST,
+                                                  dstWidth, dstHeight,
+                                                  dstData);
+
+  return result.readyToDraw();
+#else
+  return false;
+#endif
+}
+
+}
+}
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.h
@@ -0,0 +1,36 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_SCALE_H_
+#define MOZILLA_GFX_SCALE_H_
+
+#include "Types.h"
+
+namespace mozilla {
+namespace gfx {
+
+/**
+ * Scale an image using a high-quality filter.
+ *
+ * Synchronously scales an image and writes the output to the destination in
+ * 32-bit format. The destination must be pre-allocated by the caller.
+ *
+ * Returns true if scaling was successful, and false otherwise. Currently, this
+ * function is implemented using Skia. If Skia is not enabled when building,
+ * calling this function will always return false.
+ *
+ * IMPLEMENTATION NOTES:
+ * This API is not currently easily hardware acceleratable. A better API might
+ * take a SourceSurface and return a SourceSurface; the Direct2D backend, for
+ * example, could simply set a status bit on a copy of the image, and use
+ * Direct2D's high-quality scaler at draw time.
+ */
+GFX2D_API bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+                     uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+                     SurfaceFormat format);
+
+}
+}
+
+#endif /* MOZILLA_GFX_SCALE_H_ */
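
For context, a minimal sketch of how a caller might drive this API, assuming Skia is enabled and a pre-allocated, tightly packed 32bpp destination buffer. The function and buffer names here are illustrative, not part of the patch:

#include "Scale.h"

using namespace mozilla::gfx;

// Hypothetical helper: scales a tightly packed BGRA buffer.
bool ScaleExample(uint8_t* src, int32_t srcW, int32_t srcH,
                  uint8_t* dst, int32_t dstW, int32_t dstH)
{
  // Tightly packed 32bpp rows: stride == width * 4.
  return Scale(src, srcW, srcH, srcW * 4,
               dst, dstW, dstH, dstW * 4,
               FORMAT_B8G8R8A8);
}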
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.cpp
@@ -0,0 +1,864 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "convolver.h"
+
+#include <algorithm>
+#include "nsAlgorithm.h"
+
+#include "skia/SkTypes.h"
+
+// Note: SIMD_SSE2 is not enabled because the SSE2 paths apparently have bugs.
+
+#if defined(SIMD_SSE2)
+#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
+#endif
+
+namespace skia {
+
+namespace {
+
+// Converts the argument to an 8-bit unsigned value by clamping to the range
+// 0-255.
+inline unsigned char ClampTo8(int a) {
+  if (static_cast<unsigned>(a) < 256)
+    return a;  // Avoid the extra check in the common case.
+  if (a < 0)
+    return 0;
+  return 255;
+}
+
+// Stores a list of rows in a circular buffer. The usage is you write into it
+// by calling AdvanceRow. It will keep track of which row in the buffer it
+// should use next, and the total number of rows added.
+class CircularRowBuffer {
+ public:
+  // The number of pixels in each row is given in |source_row_pixel_width|.
+  // The maximum number of rows needed in the buffer is |max_y_filter_size|
+  // (we only need to store enough rows for the biggest filter).
+  //
+  // We use the |first_input_row| to compute the coordinates of all of the
+  // following rows returned by Advance().
+  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
+                    int first_input_row)
+      : row_byte_width_(dest_row_pixel_width * 4),
+        num_rows_(max_y_filter_size),
+        next_row_(0),
+        next_row_coordinate_(first_input_row) {
+    buffer_.resize(row_byte_width_ * max_y_filter_size);
+    row_addresses_.resize(num_rows_);
+  }
+
+  // Moves to the next row in the buffer, returning a pointer to the beginning
+  // of it.
+  unsigned char* AdvanceRow() {
+    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
+    next_row_coordinate_++;
+
+    // Set the pointer to the next row to use, wrapping around if necessary.
+    next_row_++;
+    if (next_row_ == num_rows_)
+      next_row_ = 0;
+    return row;
+  }
+
+  // Returns a pointer to an "unrolled" array of rows. These rows will start
+  // at the y coordinate placed into |*first_row_index| and will continue in
+  // order for the maximum number of rows in this circular buffer.
+  //
+  // The |first_row_index_| may be negative. This means the circular buffer
+  // starts before the top of the image (it hasn't been filled yet).
+  unsigned char* const* GetRowAddresses(int* first_row_index) {
+    // Example for a 4-element circular buffer holding coords 6-9.
+    //   Row 0   Coord 8
+    //   Row 1   Coord 9
+    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
+    //   Row 3   Coord 7
+    //
+    // The "next" row is also the first (lowest) coordinate. This computation
+    // may yield a negative value, but that's OK, the math will work out
+    // since the user of this buffer will compute the offset relative
+    // to the first_row_index and the negative rows will never be used.
+    *first_row_index = next_row_coordinate_ - num_rows_;
+
+    int cur_row = next_row_;
+    for (int i = 0; i < num_rows_; i++) {
+      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
+
+      // Advance to the next row, wrapping if necessary.
+      cur_row++;
+      if (cur_row == num_rows_)
+        cur_row = 0;
+    }
+    return &row_addresses_[0];
+  }
+
+ private:
+  // The buffer storing the rows. They are packed, each one row_byte_width_.
+  std::vector<unsigned char> buffer_;
+
+  // Number of bytes per row in the |buffer_|.
+  int row_byte_width_;
+
+  // The number of rows available in the buffer.
+  int num_rows_;
+
+  // The next row index we should write into. This wraps around as the
+  // circular buffer is used.
+  int next_row_;
+
+  // The y coordinate of the |next_row_|. This is incremented each time a
+  // new row is appended and does not wrap.
+  int next_row_coordinate_;
+
+  // Buffer used by GetRowAddresses().
+  std::vector<unsigned char*> row_addresses_;
+};
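
To make the wrap-around arithmetic concrete, here is a standalone sketch of the same indexing scheme, reduced to indices only (not part of the patch):

#include <cstdio>

// Sketch of the wrap-around indexing used by GetRowAddresses(). With
// num_rows == 4 and rows for coordinates 6..9 already written, next_row == 2
// and next_row_coordinate == 10; the unrolled order is buffer rows 2, 3, 0, 1,
// i.e. coordinates 6, 7, 8, 9.
int main() {
  const int num_rows = 4;
  int next_row = 2;
  int next_row_coordinate = 10;

  int first_row_index = next_row_coordinate - num_rows;  // == 6
  int cur_row = next_row;
  for (int i = 0; i < num_rows; i++) {
    std::printf("slot %d -> buffer row %d (coordinate %d)\n",
                i, cur_row, first_row_index + i);
    if (++cur_row == num_rows)  // Wrap, as in GetRowAddresses().
      cur_row = 0;
  }
  return 0;
}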
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+template<bool has_alpha>
+void ConvolveHorizontally(const unsigned char* src_data,
+                          const ConvolutionFilter1D& filter,
+                          unsigned char* out_row) {
+  // Loop over each pixel on this row in the output image.
+  int num_values = filter.num_values();
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    // Get the filter that determines the current output pixel.
+    int filter_offset, filter_length;
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const unsigned char* row_to_filter = &src_data[filter_offset * 4];
+
+    // Apply the filter to the row to get the destination pixel in |accum|.
+    int accum[4] = {0};
+    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
+      accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
+      accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
+      accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of fractional part.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
+    out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
+    out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
+    if (has_alpha)
+      out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
+  }
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
+                        int filter_length,
+                        unsigned char* const* source_data_rows,
+                        int pixel_width,
+                        unsigned char* out_row) {
+  // We go through each column in the output and do a vertical convolution,
+  // generating one output pixel each time.
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
+    // Compute the number of bytes over in each row that the current column
+    // we're convolving starts at. The pixel will cover the next 4 bytes.
+    int byte_offset = out_x * 4;
+
+    // Apply the filter to one column of pixels.
+    int accum[4] = {0};
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
+      accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
+      accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
+      accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of precision.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[byte_offset + 0] = ClampTo8(accum[0]);
+    out_row[byte_offset + 1] = ClampTo8(accum[1]);
+    out_row[byte_offset + 2] = ClampTo8(accum[2]);
+    if (has_alpha) {
+      unsigned char alpha = ClampTo8(accum[3]);
+
+      // Make sure the alpha channel doesn't come out smaller than any of the
+      // color channels. We use premultiplied alpha channels, so this should
+      // never happen, but rounding errors will cause this from time to time.
+      // These "impossible" colors will cause overflows (and hence random pixel
+      // values) when the resulting bitmap is drawn to the screen.
+      //
+      // We only need to do this when generating the final output row (here).
+      int max_color_channel = NS_MAX(out_row[byte_offset + 0],
+          NS_MAX(out_row[byte_offset + 1], out_row[byte_offset + 2]));
+      if (alpha < max_color_channel)
+        out_row[byte_offset + 3] = max_color_channel;
+      else
+        out_row[byte_offset + 3] = alpha;
+    } else {
+      // No alpha channel, the image is opaque.
+      out_row[byte_offset + 3] = 0xff;
+    }
+  }
+}
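
The alpha clamp above exists because premultiplied pixels must satisfy channel <= alpha for every color channel. A small numeric sketch (values are illustrative, not from the patch) of how a rounding error produces an "impossible" pixel and how the clamp repairs it:

#include <algorithm>
#include <cstdio>

// Sketch: why the alpha clamp matters for premultiplied pixels. Suppose
// convolution rounds a pixel to r=128, g=120, b=96, a=127. Since 128 > 127,
// un-premultiplying would compute r * 255 / a > 255 and overflow. Raising
// alpha to max(r, g, b) restores the premultiplied invariant.
int main() {
  unsigned char r = 128, g = 120, b = 96, a = 127;
  unsigned char max_color = std::max(r, std::max(g, b));
  if (a < max_color)
    a = max_color;  // a becomes 128; the pixel is valid again.
  std::printf("clamped alpha = %d\n", a);
  return 0;
}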
+
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+void ConvolveHorizontally_SSE2(const unsigned char* src_data,
+                               const ConvolutionFilter1D& filter,
+                               unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    __m128i accum = _mm_setzero_si128();
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const __m128i* row_to_filter =
+        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+      __m128i coeff, coeff16;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Load four pixels => unpack the first two pixels to 16 bits =>
+      // multiply with coefficients => accumulate the convolution result.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Duplicate 3rd and 4th coefficients for all channels =>
+      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+      // => accumulate the convolution results.
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      // [16] a3 g3 b3 r3 a2 g2 b2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Advance the pixel and coefficients pointers.
+      row_to_filter += 1;
+      filter_values += 4;
+    }
+
+    // When |filter_length| is not divisible by 4, we need to mask the extra
+    // filter coefficients that were loaded to zero. Other than that, the
+    // algorithm is the same as above, except that the 4th pixel will always
+    // be absent.
+    int r = filter_length&3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff, coeff16;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Note: line buffer must be padded to align_up(filter_offset, 16).
+      // We resolve this by using the C version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    accum = _mm_packs_epi32(accum, zero);
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    accum = _mm_packus_epi16(accum, zero);
+
+    // Store the pixel value of 32 bits.
+    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+    out_row += 4;
+  }
+#endif
+}
+
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
+                                const ConvolutionFilter1D& filter,
+                                unsigned char* out_row[4]) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // four pixels in a column per iteration.
+    __m128i accum0 = _mm_setzero_si128();
+    __m128i accum1 = _mm_setzero_si128();
+    __m128i accum2 = _mm_setzero_si128();
+    __m128i accum3 = _mm_setzero_si128();
+    int start = (filter_offset<<2);
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+      __m128i coeff, coeff16lo, coeff16hi;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                          \
+      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
+      src16 = _mm_unpacklo_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      src16 = _mm_unpackhi_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t)
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+
+      start += 16;
+      filter_values += 4;
+    }
+
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8);
+      __m128i coeff;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+
+      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      /* c1 c1 c1 c1 c0 c0 c0 c0 */
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum0 = _mm_packs_epi32(accum0, zero);
+    accum0 = _mm_packus_epi16(accum0, zero);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_packs_epi32(accum1, zero);
+    accum1 = _mm_packus_epi16(accum1, zero);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_packs_epi32(accum2, zero);
+    accum2 = _mm_packus_epi16(accum2, zero);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_packs_epi32(accum3, zero);
+    accum3 = _mm_packus_epi16(accum3, zero);
+
+    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+    out_row[0] += 4;
+    out_row[1] += 4;
+    out_row[2] += 4;
+    out_row[3] += 4;
+  }
+#endif
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int width = pixel_width & ~3;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i accum0, accum1, accum2, accum3, coeff16;
+  const __m128i* src;
+  // Output four pixels per iteration (16 bytes).
+  for (int out_x = 0; out_x < width; out_x += 4) {
+
+    // Accumulated result for each pixel. 32 bits per RGBA channel.
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    accum3 = _mm_setzero_si128();
+
+    // Convolve with one filter coefficient per iteration.
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+      // Duplicate the filter coefficient 8 times.
+      // [16] cj cj cj cj cj cj cj cj
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+      // Load four pixels (16 bytes) together.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][out_x << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+
+      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+
+      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+      // [32] a3 b3 g3 r3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum3 = _mm_add_epi32(accum3, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, accum3);
+
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+
+    if (has_alpha) {
+      // Compute the max(ri, gi, bi) for each pixel.
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+
+      // Make sure the value of alpha channel is always larger than maximum
+      // value of color channels.
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      // Set value of alpha channels to 0xFF.
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    // Store the convolution result (16 bytes) and advance the pixel pointers.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+    out_row += 16;
+  }
+
+  // When the width of the output is not divisible by 4, we need to store one
+  // pixel (4 bytes) at a time; in that case the fourth pixel is always absent.
+  if (pixel_width & 3) {
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, zero);
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+    if (has_alpha) {
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    for (int out_x = width; out_x < pixel_width; out_x++) {
+      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+      accum0 = _mm_srli_si128(accum0, 4);
+      out_row += 4;
+    }
+  }
+#endif
+}
+
+}  // namespace
+
+// ConvolutionFilter1D ---------------------------------------------------------
+
+ConvolutionFilter1D::ConvolutionFilter1D()
+    : max_filter_(0) {
+}
+
+ConvolutionFilter1D::~ConvolutionFilter1D() {
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const float* filter_values,
+                                    int filter_length) {
+  SkASSERT(filter_length > 0);
+
+  std::vector<Fixed> fixed_values;
+  fixed_values.reserve(filter_length);
+
+  for (int i = 0; i < filter_length; ++i)
+    fixed_values.push_back(FloatToFixed(filter_values[i]));
+
+  AddFilter(filter_offset, &fixed_values[0], filter_length);
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const Fixed* filter_values,
+                                    int filter_length) {
+  // It is common for leading/trailing filter values to be zeros. In such
+  // cases it is beneficial to only store the central factors.
+  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
+  // a 1080p image this optimization gives a ~10% speed improvement.
+  int first_non_zero = 0;
+  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
+    first_non_zero++;
+
+  if (first_non_zero < filter_length) {
+    // Here we have at least one non-zero factor.
+    int last_non_zero = filter_length - 1;
+    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
+      last_non_zero--;
+
+    filter_offset += first_non_zero;
+    filter_length = last_non_zero + 1 - first_non_zero;
+    SkASSERT(filter_length > 0);
+
+    for (int i = first_non_zero; i <= last_non_zero; i++)
+      filter_values_.push_back(filter_values[i]);
+  } else {
+    // Here all the factors were zeroes.
+    filter_length = 0;
+  }
+
+  FilterInstance instance;
+
+  // We pushed filter_length elements onto filter_values_
+  instance.data_location = (static_cast<int>(filter_values_.size()) -
+                            filter_length);
+  instance.offset = filter_offset;
+  instance.length = filter_length;
+  filters_.push_back(instance);
+
+  max_filter_ = NS_MAX(max_filter_, filter_length);
+}
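
A small sketch of the zero-trimming behaviour above (the taps and offsets are illustrative): adding a 6-tap filter whose first two and last taps are zero stores only the three central taps, with the offset advanced accordingly:

#include "convolver.h"

// Sketch: zero-trimming in AddFilter(). Given filter_offset == 2 and taps
// {0, 0, 0.25, 0.5, 0.25, 0}, only {0.25, 0.5, 0.25} is stored; the recorded
// instance has offset == 4 and length == 3, so FilterForValue() later returns
// exactly the non-zero taps.
void ZeroTrimExample() {
  skia::ConvolutionFilter1D filter;
  const float taps[6] = { 0.0f, 0.0f, 0.25f, 0.5f, 0.25f, 0.0f };
  filter.AddFilter(2, taps, 6);  // Leading/trailing zeros are dropped.

  int offset, length;
  filter.FilterForValue(0, &offset, &length);
  // offset == 4, length == 3; the stored taps are {0.25, 0.5, 0.25}.
}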
+
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& filter_x,
+                    const ConvolutionFilter1D& filter_y,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2) {
+#if !defined(SIMD_SSE2)
+  // Even if there is runtime support for SSE2 instructions, the binary was
+  // not built with SSE2 support, so we have to fall back to the C version.
+  use_sse2 = false;
+#endif
+
+  int max_y_filter_size = filter_y.max_filter();
+
+  // The next row in the input that we will generate a horizontally
+  // convolved row for. If the filter doesn't start at the beginning of the
+  // image (this is the case when we are only resizing a subset), then we
+  // don't want to generate any output rows before that. Compute the starting
+  // row for convolution as the first pixel for the first vertical filter.
+  int filter_offset, filter_length;
+  const ConvolutionFilter1D::Fixed* filter_values =
+      filter_y.FilterForValue(0, &filter_offset, &filter_length);
+  int next_x_row = filter_offset;
+
+  // We loop over each row in the input doing a horizontal convolution. This
+  // will result in a horizontally convolved image. We write the results into
+  // a circular buffer of convolved rows and do vertical convolution as rows
+  // are available. This prevents us from having to store the entire
+  // intermediate image and helps cache coherency.
+  // We need four extra rows so that horizontal convolution can be done on
+  // four rows simultaneously. We also pad each row in the row buffer to be
+  // aligned up to 16 bytes.
+  // TODO(jiesun): We do not use aligned load from row buffer in vertical
+  // convolution pass yet. Somehow Windows does not like it.
+  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
+  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
+  CircularRowBuffer row_buffer(row_buffer_width,
+                               row_buffer_height,
+                               filter_offset);
+
+  // Loop over every possible output row, processing just enough horizontal
+  // convolutions to run each subsequent vertical convolution.
+  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
+  int num_output_rows = filter_y.num_values();
+
+  // We need to check which is the last line to convolve before we advance 4
+  // lines in one iteration.
+  int last_filter_offset, last_filter_length;
+  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
+                          &last_filter_length);
+
+  for (int out_y = 0; out_y < num_output_rows; out_y++) {
+    filter_values = filter_y.FilterForValue(out_y,
+                                            &filter_offset, &filter_length);
+
+    // Generate output rows until we have enough to run the current filter.
+    if (use_sse2) {
+      while (next_x_row < filter_offset + filter_length) {
+        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
+          const unsigned char* src[4];
+          unsigned char* out_row[4];
+          for (int i = 0; i < 4; ++i) {
+            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
+            out_row[i] = row_buffer.AdvanceRow();
+          }
+          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
+          next_x_row += 4;
+        } else {
+          // For the last row, an SSE2 load could access data beyond the
+          // image area, therefore we use the C version here.
+          if (next_x_row == last_filter_offset + last_filter_length - 1) {
+            if (source_has_alpha) {
+              ConvolveHorizontally<true>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            } else {
+              ConvolveHorizontally<false>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            }
+          } else {
+            ConvolveHorizontally_SSE2(
+                &source_data[next_x_row * source_byte_row_stride],
+                filter_x, row_buffer.AdvanceRow());
+          }
+          next_x_row++;
+        }
+      }
+    } else {
+      while (next_x_row < filter_offset + filter_length) {
+        if (source_has_alpha) {
+          ConvolveHorizontally<true>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        } else {
+          ConvolveHorizontally<false>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        }
+        next_x_row++;
+      }
+    }
+
+    // Compute where in the output image this row of final data will go.
+    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
+
+    // Get the list of rows that the circular buffer has, in order.
+    int first_row_in_circular_buffer;
+    unsigned char* const* rows_to_convolve =
+        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
+
+    // Now compute the start of the subset of those rows that the filter
+    // needs.
+    unsigned char* const* first_row_for_filter =
+        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
+
+    if (source_has_alpha) {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<true>(filter_values, filter_length,
+                                      first_row_for_filter,
+                                      filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<true>(filter_values, filter_length,
+                                 first_row_for_filter,
+                                 filter_x.num_values(), cur_output_row);
+      }
+    } else {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<false>(filter_values, filter_length,
+                                       first_row_for_filter,
+                                       filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<false>(filter_values, filter_length,
+                                  first_row_for_filter,
+                                  filter_x.num_values(), cur_output_row);
+      }
+    }
+  }
+}
+
+}  // namespace skia
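
As an end-to-end sketch (not part of the patch), the snippet below builds normalized 2-tap box filters by hand and halves a 4x4 BGRA image with BGRAConvolve2D. Real callers go through ImageOperations::Resize, which computes these filters for them:

#include "convolver.h"

// Sketch: halve a 4x4 premultiplied-BGRA image to 2x2 using hand-built
// 2-tap box filters (each output pixel averages a 2x2 block of source pixels).
void HalveExample(const unsigned char src[4 * 4 * 4],
                  unsigned char dst[2 * 2 * 4]) {
  const float box[2] = { 0.5f, 0.5f };  // Normalized: taps sum to one.

  skia::ConvolutionFilter1D filter_x, filter_y;
  for (int out = 0; out < 2; ++out) {
    filter_x.AddFilter(out * 2, box, 2);  // Output column <- source columns 2*out, 2*out+1.
    filter_y.AddFilter(out * 2, box, 2);  // Output row    <- source rows    2*out, 2*out+1.
  }

  skia::BGRAConvolve2D(src, /* source_byte_row_stride = */ 4 * 4,
                       /* source_has_alpha = */ true,
                       filter_x, filter_y,
                       /* output_byte_row_stride = */ 2 * 4,
                       dst, /* use_sse2 = */ false);
}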
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_CONVOLVER_H_
+#define SKIA_EXT_CONVOLVER_H_
+
+#include <cmath>
+#include <vector>
+
+#include "base/basictypes.h"
+#include "prtypes.h"
+#include "base/cpu.h"
+#include "skia/SkTypes.h"
+
+// avoid confusion with Mac OS X's math library (Carbon)
+#if defined(__APPLE__)
+#undef FloatToFixed
+#undef FixedToFloat
+#endif
+
+namespace skia {
+
+// Represents a filter in one dimension. Each output pixel has one entry in this
+// object for the filter values contributing to it. You build up the filter
+// list by calling AddFilter for each output pixel (in order).
+//
+// We do 2-dimensional convolution by first convolving each row by one
+// ConvolutionFilter1D, then convolving each column by another one.
+//
+// Entries are stored in fixed point, shifted left by kShiftBits.
+class ConvolutionFilter1D {
+ public:
+  typedef short Fixed;
+
+  // The number of bits that fixed point values are shifted by.
+  enum { kShiftBits = 14 };
+
+  ConvolutionFilter1D();
+  ~ConvolutionFilter1D();
+
+  // Convert between floating point and our fixed point representation.
+  static Fixed FloatToFixed(float f) {
+    return static_cast<Fixed>(f * (1 << kShiftBits));
+  }
+  static unsigned char FixedToChar(Fixed x) {
+    return static_cast<unsigned char>(x >> kShiftBits);
+  }
+  static float FixedToFloat(Fixed x) {
+    // The cast relies on Fixed being a short, implying that on
+    // the platforms we care about all (16) bits will fit into
+    // the mantissa of a (32-bit) float.
+    COMPILE_ASSERT(sizeof(Fixed) == 2, fixed_type_should_fit_in_float_mantissa);
+    float raw = static_cast<float>(x);
+    return ldexpf(raw, -kShiftBits);
+  }
+
+  // Returns the maximum pixel span of a filter.
+  int max_filter() const { return max_filter_; }
+
+  // Returns the number of filters in this filter list. This is the dimension
+  // of the output image in the direction being convolved.
+  int num_values() const { return static_cast<int>(filters_.size()); }
+
+  // Appends the given list of scaling values for generating a given output
+  // pixel. |filter_offset| is the distance from the edge of the image to where
+  // the scaling factors start. The scaling factors apply to the source pixels
+  // starting from this position, and going for the next |filter_length| pixels.
+  //
+  // You will probably want to make sure your input is normalized (that is,
+  // all entries in |filter_values| sum to one) to avoid affecting the overall
+  // brightness of the image.
+  //
+  // The filter_length must be > 0.
+  //
+  // This version will automatically convert your input to fixed point.
+  void AddFilter(int filter_offset,
+                 const float* filter_values,
+                 int filter_length);
+
+  // Same as the above version, but the input is already fixed point.
+  void AddFilter(int filter_offset,
+                 const Fixed* filter_values,
+                 int filter_length);
+
+  // Retrieves a filter for the given |value_offset|, a position in the output
+  // image in the direction we're convolving. The offset and length of the
+  // filter values are put into the corresponding out arguments (see AddFilter
+  // above for what these mean), and a pointer to the first scaling factor is
+  // returned. There will be |filter_length| values in this array.
+  inline const Fixed* FilterForValue(int value_offset,
+                                     int* filter_offset,
+                                     int* filter_length) const {
+    const FilterInstance& filter = filters_[value_offset];
+    *filter_offset = filter.offset;
+    *filter_length = filter.length;
+    if (filter.length == 0) {
+      return NULL;
+    }
+    return &filter_values_[filter.data_location];
+  }
+
+
+  inline void PaddingForSIMD(int padding_count) {
+    // Pads with |padding_count| dummy coefficients after the coefficients of
+    // the last filter, to prevent SIMD instructions that load 8 or 16 bytes
+    // at a time from reading invalid memory areas. We are not trying to align
+    // the coefficients right now due to the opaqueness of the <vector>
+    // implementation. This has to be done after all |AddFilter| calls.
+    for (int i = 0; i < padding_count; ++i)
+      filter_values_.push_back(static_cast<Fixed>(0));
+  }
+
+ private:
+  struct FilterInstance {
+    // Offset within filter_values for this instance of the filter.
+    int data_location;
+
+    // Distance from the left of the filter to the center. IN PIXELS
+    int offset;
+
+    // Number of values in this filter instance.
+    int length;
+  };
+
+  // Stores the information for each filter added to this class.
+  std::vector<FilterInstance> filters_;
+
+  // We store all the filter values in this flat list, indexed by
+  // |FilterInstance.data_location| to avoid the mallocs required for storing
+  // each one separately.
+  std::vector<Fixed> filter_values_;
+
+  // The maximum size of any filter we've added.
+  int max_filter_;
+};
+
+// Does a two-dimensional convolution on the given source image.
+//
+// It is assumed the source pixel offsets referenced in the input filters
+// reference only valid pixels, so the source image size is not required. Each
+// row of the source image starts |source_byte_row_stride| after the previous
+// one (this allows you to have rows with some padding at the end).
+//
+// The result will be put into the given output buffer. The destination image
+// size will be xfilter.num_values() * yfilter.num_values() pixels. It will be
+// in rows of exactly xfilter.num_values() * 4 bytes.
+//
+// |source_has_alpha| is a hint that allows us to avoid doing computations on
+// the alpha channel if the image is opaque. If you don't know, set this to
+// true and it will work properly, but setting this to false will be a few
+// percent faster if you know the image is opaque.
+//
+// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
+// (this is ARGB when loaded into 32-bit words on a little-endian machine).
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& xfilter,
+                    const ConvolutionFilter1D& yfilter,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2);
+}  // namespace skia
+
+#endif  // SKIA_EXT_CONVOLVER_H_
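
For intuition about the Fixed representation declared above (kShiftBits == 14), a quick sketch of the conversions; the helper name is illustrative:

#include "convolver.h"
#include <cassert>

// Sketch: ConvolutionFilter1D::Fixed is a 16-bit value scaled by 2^14,
// so 1.0f maps to 16384 and the round-trip is exact for such values.
void FixedPointExample() {
  typedef skia::ConvolutionFilter1D F;
  F::Fixed one  = F::FloatToFixed(1.0f);  // == 16384
  F::Fixed half = F::FloatToFixed(0.5f);  // == 8192
  assert(one == 1 << F::kShiftBits);
  assert(F::FixedToFloat(half) == 0.5f);
}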
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.cpp
@@ -0,0 +1,536 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/basictypes.h"
+
+#define _USE_MATH_DEFINES
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "image_operations.h"
+
+#include "nsAlgorithm.h"
+#include "base/stack_container.h"
+#include "convolver.h"
+#include "skia/SkColorPriv.h"
+#include "skia/SkBitmap.h"
+#include "skia/SkRect.h"
+#include "skia/SkFontHost.h"
+
+namespace skia {
+
+namespace {
+
+// Returns the ceiling/floor as an integer.
+inline int CeilInt(float val) {
+  return static_cast<int>(ceil(val));
+}
+inline int FloorInt(float val) {
+  return static_cast<int>(floor(val));
+}
+
+// Filter function computation -------------------------------------------------
+
+// Evaluates the box filter, which goes from -0.5 to +0.5.
+float EvalBox(float x) {
+  return (x >= -0.5f && x < 0.5f) ? 1.0f : 0.0f;
+}
+
+// Evaluates the Lanczos filter of the given filter size window for the given
+// position.
+//
+// |filter_size| is the width of the filter (the "window"), outside of which
+// the value of the function is 0. Inside of the window, the value is the
+// normalized sinc function:
+//   lanczos(x) = sinc(x) * sinc(x / filter_size);
+// where
+//   sinc(x) = sin(pi*x) / (pi*x);
+float EvalLanczos(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the discontinuity at the origin.
+  float xpi = x * static_cast<float>(M_PI);
+  return (sin(xpi) / xpi) *  // sinc(x)
+          sin(xpi / filter_size) / (xpi / filter_size);  // sinc(x/filter_size)
+}
+
+// Evaluates the Hamming filter of the given filter size window for the given
+// position.
+//
+// The filter covers [-filter_size, +filter_size]. Outside of this window
+// the value of the function is 0. Inside of the window, the value is sinus
+// cardinal multiplied by a recentered Hamming function. The traditional
+// Hamming formula for a window of size N and n ranging in [0, N-1] is:
+//   hamming(n) = 0.54 - 0.46 * cos(2 * pi * n / (N-1))
+// In our case we want the function centered for x == 0 and at its minimum
+// on both ends of the window (x == +/- filter_size), hence the adjusted
+// formula:
+//   hamming(x) = (0.54 -
+//                 0.46 * cos(2 * pi * (x - filter_size)/ (2 * filter_size)))
+//              = 0.54 - 0.46 * cos(pi * x / filter_size - pi)
+//              = 0.54 + 0.46 * cos(pi * x / filter_size)
+float EvalHamming(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the sinc discontinuity at the origin.
+  const float xpi = x * static_cast<float>(M_PI);
+
+  return ((sin(xpi) / xpi) *  // sinc(x)
+          (0.54f + 0.46f * cos(xpi / filter_size)));  // hamming(x)
+}
+
+// ResizeFilter ----------------------------------------------------------------
+
+// Encapsulates computation and storage of the filters required for one complete
+// resize operation.
+class ResizeFilter {
+ public:
+  ResizeFilter(ImageOperations::ResizeMethod method,
+               int src_full_width, int src_full_height,
+               int dest_width, int dest_height,
+               const SkIRect& dest_subset);
+
+  // Returns the filled filter values.
+  const ConvolutionFilter1D& x_filter() { return x_filter_; }
+  const ConvolutionFilter1D& y_filter() { return y_filter_; }
+
+ private:
+  // Returns the number of pixels that the filter spans, in filter space (the
+  // destination image).
+  float GetFilterSupport(float scale) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        // The box filter just scales with the image scaling.
+        return 0.5f;  // Only want one side of the filter = /2.
+      case ImageOperations::RESIZE_HAMMING1:
+        // The Hamming filter takes as much space in the source image in
+        // each direction as the size of the window = 1 for Hamming1.
+        return 1.0f;
+      case ImageOperations::RESIZE_LANCZOS2:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 2 for Lanczos2.
+        return 2.0f;
+      case ImageOperations::RESIZE_LANCZOS3:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 3 for Lanczos3.
+        return 3.0f;
+      default:
+        return 1.0f;
+    }
+  }
+
+  // Computes one set of filters either horizontally or vertically. The caller
+  // will specify the "min" and "max" rather than left/right or top/bottom so
+  // that the same code can be re-used in each dimension.
+  //
+  // |src_depend_lo| and |src_depend_size| give the range for the source
+  // depend rectangle (horizontally or vertically at the caller's discretion
+  // -- see above for what this means).
+  //
+  // Likewise, the range of destination values to compute and the scale factor
+  // for the transform is also specified.
+  void ComputeFilters(int src_size,
+                      int dest_subset_lo, int dest_subset_size,
+                      float scale, float src_support,
+                      ConvolutionFilter1D* output);
+
+  // Computes the filter value given the coordinate in filter space.
+  inline float ComputeFilter(float pos) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        return EvalBox(pos);
+      case ImageOperations::RESIZE_HAMMING1:
+        return EvalHamming(1, pos);
+      case ImageOperations::RESIZE_LANCZOS2:
+        return EvalLanczos(2, pos);
+      case ImageOperations::RESIZE_LANCZOS3:
+        return EvalLanczos(3, pos);
+      default:
+        return 0;
+    }
+  }
+
+  ImageOperations::ResizeMethod method_;
+
+  // Size of the filter support on one side only in the destination space.
+  // See GetFilterSupport.
+  float x_filter_support_;
+  float y_filter_support_;
+
+  // Subset of scaled destination bitmap to compute.
+  SkIRect out_bounds_;
+
+  ConvolutionFilter1D x_filter_;
+  ConvolutionFilter1D y_filter_;
+
+  DISALLOW_COPY_AND_ASSIGN(ResizeFilter);
+};
+
+ResizeFilter::ResizeFilter(ImageOperations::ResizeMethod method,
+                           int src_full_width, int src_full_height,
+                           int dest_width, int dest_height,
+                           const SkIRect& dest_subset)
+    : method_(method),
+      out_bounds_(dest_subset) {
+  // method_ will only ever refer to an "algorithm method".
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  float scale_x = static_cast<float>(dest_width) /
+                  static_cast<float>(src_full_width);
+  float scale_y = static_cast<float>(dest_height) /
+                  static_cast<float>(src_full_height);
+
+  x_filter_support_ = GetFilterSupport(scale_x);
+  y_filter_support_ = GetFilterSupport(scale_y);
+
+  // Support of the filter in source space.
+  float src_x_support = x_filter_support_ / scale_x;
+  float src_y_support = y_filter_support_ / scale_y;
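+  // For example, a 4x downscale (scale == 0.25f) using Lanczos3 (filter
+  // support == 3 in destination space) needs 3 / 0.25 == 12 source pixels
+  // on either side of each destination pixel's center.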
+
+  ComputeFilters(src_full_width, dest_subset.fLeft, dest_subset.width(),
+                 scale_x, src_x_support, &x_filter_);
+  ComputeFilters(src_full_height, dest_subset.fTop, dest_subset.height(),
+                 scale_y, src_y_support, &y_filter_);
+}
+
+// TODO(egouriou): Take advantage of periods in the convolution.
+// Practical resizing filters are periodic outside of the border area.
+// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
+// source become p pixels in the destination) will have a period of p.
+// A nice consequence is a period of 1 when downscaling by an integral
+// factor. Downscaling from typical display resolutions is also bound
+// to produce interesting periods as those are chosen to have multiple
+// small factors.
+// Small periods reduce computational load and improve cache usage if
+// the coefficients can be shared. For periods of 1 we can consider
+// loading the factors only once outside the borders.
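+// For example, scaling 3 source pixels to 2 destination pixels is a reduced
+// factor of p/q = 2/3, so the filter coefficients repeat every 2 destination
+// pixels, while a 4x integral downscale (factor 1/4) has a period of 1.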
+void ResizeFilter::ComputeFilters(int src_size,
+                                  int dest_subset_lo, int dest_subset_size,
+                                  float scale, float src_support,
+                                  ConvolutionFilter1D* output) {
+  int dest_subset_hi = dest_subset_lo + dest_subset_size;  // [lo, hi)
+
+  // When we're doing a magnification, the scale will be larger than one. This
+  // means the destination pixels are much smaller than the source pixels, and
+  // that the range covered by the filter won't necessarily cover any source
+  // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+  // some computations.
+  float clamped_scale = NS_MIN(1.0f, scale);
+
+  // Speed up the divisions below by turning them into multiplies.
+  float inv_scale = 1.0f / scale;
+
+  StackVector<float, 64> filter_values;
+  StackVector<int16_t, 64> fixed_filter_values;
+
+  // Loop over all pixels in the output range. We will generate one set of
+  // filter values for each one. Those values will tell us how to blend the
+  // source pixels to compute the destination pixel.
+  for (int dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi;
+       dest_subset_i++) {
+    // Reset the arrays. We don't declare them inside so they can re-use the
+    // same malloc-ed buffer.
+    filter_values->clear();
+    fixed_filter_values->clear();
+
+    // This is the pixel in the source directly under the pixel in the dest.
+    // Note that we base computations on the "center" of the pixels. To see
+    // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
+    // downscale should "cover" the pixels around the pixel with *its center*
+    // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
+    // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
+    // TODO(evannier): this code is therefore incorrect and should read:
+    // float src_pixel = (static_cast<float>(dest_subset_i) + 0.5f) * inv_scale;
+    // I leave it incorrect, because changing it would require modifying
+    // the results for the webkit test, which I will do in a subsequent checkin.
+    float src_pixel = dest_subset_i * inv_scale;
+
+    // Compute the (inclusive) range of source pixels the filter covers.
+    int src_begin = NS_MAX(0, FloorInt(src_pixel - src_support));
+    int src_end = NS_MIN(src_size - 1, CeilInt(src_pixel + src_support));
+
+    // Compute the unnormalized filter value at each location of the source
+    // it covers.
+    float filter_sum = 0.0f;  // Sum of the filter values for normalizing.
+    for (int cur_filter_pixel = src_begin; cur_filter_pixel <= src_end;
+         cur_filter_pixel++) {
+      // Distance from the center of the filter, this is the filter coordinate
+      // in source space. We also need to consider the center of the pixel
+      // when comparing distance against 'src_pixel'. In the 5x downscale
+      // example used above the distance from the center of the filter to
+      // the pixel with coordinates (2, 2) should be 0, because its center
+      // is at (2.5, 2.5).
+      // TODO(evannier): as above (regarding the 0.5 pixel error), this code
+      // is incorrect, but is left as-is for the same reasons.
+      // float src_filter_dist =
+      //     ((static_cast<float>(cur_filter_pixel) + 0.5f) - src_pixel);
+      float src_filter_dist = cur_filter_pixel - src_pixel;
+
+      // Since the filter really exists in dest space, map it there.
+      float dest_filter_dist = src_filter_dist * clamped_scale;
+
+      // Compute the filter value at that location.
+      float filter_value = ComputeFilter(dest_filter_dist);
+      filter_values->push_back(filter_value);
+
+      filter_sum += filter_value;
+    }
+
+    // The filter must be normalized so that we don't affect the brightness of
+    // the image. Convert to normalized fixed point.
+    int16_t fixed_sum = 0;
+    for (size_t i = 0; i < filter_values->size(); i++) {
+      int16_t cur_fixed = output->FloatToFixed(filter_values[i] / filter_sum);
+      fixed_sum += cur_fixed;
+      fixed_filter_values->push_back(cur_fixed);
+    }
+
+    // The conversion to fixed point will leave some rounding errors, which
+    // we add back in to avoid affecting the brightness of the image. We
+    // arbitrarily add this to the center of the filter array (this won't always
+    // be the center of the filter function since it could get clipped on the
+    // edges, but it doesn't matter enough to worry about that case).
+    int16_t leftovers = output->FloatToFixed(1.0f) - fixed_sum;
+    fixed_filter_values[fixed_filter_values->size() / 2] += leftovers;
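+    // For example, if FloatToFixed(1.0f) == 16384 (14 fractional bits) and
+    // the rounded taps sum to 16382, the missing 2/16384 of the total weight
+    // is credited to the middle tap.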
+
+    // Now it's ready to go.
+    output->AddFilter(src_begin, &fixed_filter_values[0],
+                      static_cast<int>(fixed_filter_values->size()));
+  }
+
+  output->PaddingForSIMD(8);
+}
+
+ImageOperations::ResizeMethod ResizeMethodToAlgorithmMethod(
+    ImageOperations::ResizeMethod method) {
+  // Convert any "Quality Method" into an "Algorithm Method".
+  if (method >= ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD &&
+      method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD) {
+    return method;
+  }
+  // Any GPU-accelerated resizing takes place before we reach this point, so
+  // here we just pick the appropriate software method for each resize
+  // quality.
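+  // With the mapping below, RESIZE_GOOD and RESIZE_BETTER both resolve to
+  // RESIZE_HAMMING1, and RESIZE_BEST (via the default case) resolves to
+  // RESIZE_LANCZOS3.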
+  switch (method) {
+    // Users of RESIZE_GOOD are willing to trade a lot of quality to
+    // get speed, allowing the use of linear resampling to get hardware
+    // acceleration (SRB). Hence any of our "good" software filters
+    // will be acceptable, and we use the fastest one, Hamming-1.
+    case ImageOperations::RESIZE_GOOD:
+      // Users of RESIZE_BETTER are willing to trade some quality in order
+      // to improve performance, but are guaranteed not to devolve to a linear
+      // resampling. In visual tests we see that Hamming-1 is not as good as
+      // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is
+      // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed
+      // an acceptable trade-off between quality and speed.
+    case ImageOperations::RESIZE_BETTER:
+      return ImageOperations::RESIZE_HAMMING1;
+    default:
+      return ImageOperations::RESIZE_LANCZOS3;
+  }
+}
+
+}  // namespace
+
+// Resize ----------------------------------------------------------------------
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset,
+                                 void* dest_pixels /* = nullptr */) {
+  if (method == ImageOperations::RESIZE_SUBPIXEL)
+    return ResizeSubpixel(source, dest_width, dest_height, dest_subset);
+  else
+    return ResizeBasic(source, method, dest_width, dest_height, dest_subset,
+                       dest_pixels);
+}
+
+// static
+SkBitmap ImageOperations::ResizeSubpixel(const SkBitmap& source,
+                                         int dest_width, int dest_height,
+                                         const SkIRect& dest_subset) {
+  // Currently only works on Linux/BSD because these are the only platforms
+  // where SkFontHost::GetSubpixelOrder is defined.
+#if defined(XP_UNIX)
+  // Understand the display.
+  const SkFontHost::LCDOrder order = SkFontHost::GetSubpixelOrder();
+  const SkFontHost::LCDOrientation orientation =
+      SkFontHost::GetSubpixelOrientation();
+
+  // Decide on which dimension, if any, to deploy subpixel rendering.
+  int w = 1;
+  int h = 1;
+  switch (orientation) {
+    case SkFontHost::kHorizontal_LCDOrientation:
+      w = dest_width < source.width() ? 3 : 1;
+      break;
+    case SkFontHost::kVertical_LCDOrientation:
+      h = dest_height < source.height() ? 3 : 1;
+      break;
+  }
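+  // For example, on a display with horizontal RGB stripes a downscale to
+  // W x H is first rendered at (3 * W) x H; each run of three horizontal
+  // samples then supplies the R, G and B channels of one output pixel.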
+
+  // Resize the image.
+  const int width = dest_width * w;
+  const int height = dest_height * h;
+  SkIRect subset = { dest_subset.fLeft, dest_subset.fTop,
+                     dest_subset.fLeft + dest_subset.width() * w,
+                     dest_subset.fTop + dest_subset.height() * h };
+  SkBitmap img = ResizeBasic(source, ImageOperations::RESIZE_LANCZOS3, width,
+                             height, subset);
+  const int row_words = img.rowBytes() / 4;
+  if (w == 1 && h == 1)
+    return img;
+
+  // Render into subpixels.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config, dest_subset.width(),
+                   dest_subset.height());
+  result.allocPixels();
+  if (!result.readyToDraw())
+    return img;
+
+  SkAutoLockPixels locker(img);
+  if (!img.readyToDraw())
+    return img;
+
+  uint32_t* src_row = img.getAddr32(0, 0);
+  uint32_t* dst_row = result.getAddr32(0, 0);
+  for (int y = 0; y < dest_subset.height(); y++) {
+    uint32_t* src = src_row;
+    uint32_t* dst = dst_row;
+    for (int x = 0; x < dest_subset.width(); x++, src += w, dst++) {
+      uint8_t r = 0, g = 0, b = 0, a = 0;
+      switch (order) {
+        case SkFontHost::kRGB_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              r = SkGetPackedR32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              b = SkGetPackedB32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              r = SkGetPackedR32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              b = SkGetPackedB32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kBGR_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              b = SkGetPackedB32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              r = SkGetPackedR32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              b = SkGetPackedB32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              r = SkGetPackedR32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kNONE_LCDOrder:
+          break;
+      }
+      // Premultiplied alpha requires every color channel to be no larger
+      // than alpha. Since each channel was sampled from a different
+      // subpixel, clamp alpha up to the largest channel.
+      a = a > r ? a : r;
+      a = a > g ? a : g;
+      a = a > b ? a : b;
+      *dst = SkPackARGB32(a, r, g, b);
+    }
+    src_row += h * row_words;
+    dst_row += result.rowBytes() / 4;
+  }
+  result.setIsOpaque(img.isOpaque());
+  return result;
+#else
+  return SkBitmap();
+#endif  // defined(XP_UNIX)
+}
+
+// static
+SkBitmap ImageOperations::ResizeBasic(const SkBitmap& source,
+                                      ResizeMethod method,
+                                      int dest_width, int dest_height,
+                                      const SkIRect& dest_subset,
+                                      void* dest_pixels /* = nullptr */) {
+  // Ensure that the ResizeMethod enumeration is sound.
+  SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
+            (method <= RESIZE_LAST_QUALITY_METHOD)) ||
+           ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+            (method <= RESIZE_LAST_ALGORITHM_METHOD)));
+
+  // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
+  // return empty.
+  if (source.width() < 1 || source.height() < 1 ||
+      dest_width < 1 || dest_height < 1)
+    return SkBitmap();
+
+  method = ResizeMethodToAlgorithmMethod(method);
+  // Check that we deal only with "algorithm methods" from this point onward.
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  SkAutoLockPixels locker(source);
+  if (!source.readyToDraw())
+      return SkBitmap();
+
+  ResizeFilter filter(method, source.width(), source.height(),
+                      dest_width, dest_height, dest_subset);
+
+  // Point at the source pixels. The convolution below reads them in place,
+  // using the source bitmap's row stride.
+  const uint8_t* source_subset =
+      reinterpret_cast<const uint8_t*>(source.getPixels());
+
+  // Convolve into the result.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config,
+                   dest_subset.width(), dest_subset.height());
+
+  if (dest_pixels) {
+    result.setPixels(dest_pixels);
+  } else {
+    result.allocPixels();
+  }
+
+  if (!result.readyToDraw())
+    return SkBitmap();
+
+  BGRAConvolve2D(source_subset, static_cast<int>(source.rowBytes()),
+                 !source.isOpaque(), filter.x_filter(), filter.y_filter(),
+                 static_cast<int>(result.rowBytes()),
+                 static_cast<unsigned char*>(result.getPixels()),
+                 /* sse = */ false);
+
+  // Preserve the "opaque" flag for use as an optimization later.
+  result.setIsOpaque(source.isOpaque());
+
+  return result;
+}
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 void* dest_pixels /* = nullptr */) {
+  SkIRect dest_subset = { 0, 0, dest_width, dest_height };
+  return Resize(source, method, dest_width, dest_height, dest_subset,
+                dest_pixels);
+}
+
+}  // namespace skia
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_IMAGE_OPERATIONS_H_
+#define SKIA_EXT_IMAGE_OPERATIONS_H_
+
+#include "skia/SkTypes.h"
+#include "Types.h"
+
+class SkBitmap;
+struct SkIRect;
+
+namespace skia {
+
+class ImageOperations {
+ public:
+  enum ResizeMethod {
+    //
+    // Quality Methods
+    //
+    // Those enumeration values express a desired quality/speed tradeoff.
+    // They are translated into an algorithm-specific method that depends
+    // on the capabilities (CPU, GPU) of the underlying platform.
+    // It is possible for all three methods to be mapped to the same
+    // algorithm on a given platform.
+
+    // Good quality resizing. Fastest resizing with acceptable visual quality.
+    // This is typically intended for use during interactive layouts
+    // where slower platforms may want to trade image quality for a large
+    // increase in resizing performance.
+    //
+    // For example the resizing implementation may devolve to linear
+    // filtering if this enables GPU acceleration to be used.
+    //
+    // Note that the underlying resizing method may be determined
+    // on the fly based on the parameters for a given resize call.
+    // For example an implementation using a GPU-based linear filter
+    // in the common case may still use a higher-quality software-based
+    // filter in cases where using the GPU would actually be slower - due
+    // to too much latency - or impossible - due to image format or size
+    // constraints.
+    RESIZE_GOOD,
+
+    // Medium quality resizing. Close to high quality resizing (better
+    // than linear interpolation) with potentially some quality being
+    // traded off for additional speed compared to RESIZE_BEST.
+    //
+    // This is intended, for example, for generation of large thumbnails
+    // (hundreds of pixels in each dimension) from large sources, where
+    // a linear filter would produce too many artifacts but where
+    // RESIZE_BEST might be too costly time-wise.
+    RESIZE_BETTER,
+
+    // High quality resizing. The algorithm is picked to favor image quality.
+    RESIZE_BEST,
+
+    //
+    // Algorithm-specific enumerations
+    //
+
+    // Box filter. This is a weighted average of all of the pixels touching
+    // the destination pixel. For enlargement, this is nearest neighbor.
+    //
+    // You probably don't want this; it is here for testing since it is easy to
+    // compute. Use RESIZE_LANCZOS3 instead.
+    RESIZE_BOX,
+
+    // 1-cycle Hamming filter. This is tall in the middle and falls off towards
+    // the window edges but without going to 0. This is about 40% faster than
+    // a 2-cycle Lanczos.
+    RESIZE_HAMMING1,
+
+    // 2-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then returns to zero. Does not provide as good a frequency
+    // response as a 3-cycle Lanczos but is roughly 30% faster.
+    RESIZE_LANCZOS2,
+
+    // 3-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then oscillates 2 more times. It gives nice sharp edges.
+    RESIZE_LANCZOS3,
+
+    // Lanczos filter + subpixel interpolation. If subpixel rendering is not
+    // appropriate we automatically fall back to Lanczos.
+    RESIZE_SUBPIXEL,
+
+    // Enum aliases for the first and last methods, by quality and by algorithm.
+    RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
+    RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
+    RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
+    RESIZE_LAST_ALGORITHM_METHOD = RESIZE_SUBPIXEL,
+  };
+
+  // Resizes the given source bitmap using the specified resize method, so that
+  // the entire image is (dest_width x dest_height) big. The dest_subset is the
+  // rectangle in this destination image that should actually be returned.
+  //
+  // The output image will be (dest_subset.width(), dest_subset.height()). This
+  // will save work if you do not need the entire bitmap.
+  //
+  // The destination subset must lie within the destination image.
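+  //
+  // For example (illustrative only), to shrink |source| to a 100x100
+  // thumbnail but return only its left half:
+  //
+  //   SkIRect subset = { 0, 0, 50, 100 };
+  //   SkBitmap thumb = skia::ImageOperations::Resize(
+  //       source, skia::ImageOperations::RESIZE_BEST, 100, 100, subset);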
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         const SkIRect& dest_subset,
+                         void* dest_pixels = nullptr);
+
+  // Alternate version for resizing and returning the entire bitmap rather than
+  // a subset.
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         void* dest_pixels = nullptr);
+
+ private:
+  ImageOperations();  // Class for scoping only.
+
+  // Supports all methods except RESIZE_SUBPIXEL.
+  static SkBitmap ResizeBasic(const SkBitmap& source,
+                              ResizeMethod method,
+                              int dest_width, int dest_height,
+                              const SkIRect& dest_subset,
+                              void* dest_pixels = nullptr);
+
+  // Subpixel renderer.
+  static SkBitmap ResizeSubpixel(const SkBitmap& source,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset);
+};
+
+}  // namespace skia
+
+#endif  // SKIA_EXT_IMAGE_OPERATIONS_H_