Bug 486918. Part 1: Import Chromium's higher-quality image scalers, since we know those to be good and shippable. r=jrmuizel
author       Joe Drew <joe@drew.ca>
date         Thu, 23 Aug 2012 15:36:04 -0400
changeset    108668 a59944cc3781
parent       108667 bed067351ab6
child        108669 27e0c22b96e5
push id      15624
push user    philringnalda@gmail.com
push date    Sun, 30 Sep 2012 05:00:09 +0000
treeherder   mozilla-inbound@aaf9e3020132
reviewers    jrmuizel
bugs         486918
milestone    18.0a1
Bug 486918. Part 1: Import Chromium's higher-quality image scalers, since we know those to be good and shippable. r=jrmuizel
gfx/2d/HelpersSkia.h
gfx/2d/Makefile.in
gfx/2d/Scale.cpp
gfx/2d/Scale.h
gfx/2d/basictypes.h
gfx/2d/convolver.cpp
gfx/2d/convolver.h
gfx/2d/cpu.h
gfx/2d/image_operations.cpp
gfx/2d/image_operations.h
gfx/2d/port.h
gfx/2d/stack_container.h
--- a/gfx/2d/HelpersSkia.h
+++ b/gfx/2d/HelpersSkia.h
@@ -5,16 +5,17 @@
 
 #ifndef MOZILLA_GFX_HELPERSSKIA_H_
 #define MOZILLA_GFX_HELPERSSKIA_H_
 
 #include "2D.h"
 #include "skia/SkCanvas.h"
 #include "skia/SkDashPathEffect.h"
 #include "mozilla/Assertions.h"
+#include <vector>
 
 namespace mozilla {
 namespace gfx {
 
 static inline SkBitmap::Config
 GfxFormatToSkiaConfig(SurfaceFormat format)
 {
   switch (format)
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@@ -24,16 +24,17 @@ EXPORTS_mozilla/gfx	= \
         BaseMargin.h \
         BaseRect.h \
         BaseSize.h \
         Blur.h \
         PathHelpers.h \
         Point.h \
         Matrix.h \
         Rect.h \
+        Scale.h \
         Types.h \
         Tools.h \
         UserData.h \
 	$(NULL)
 
 CPPSRCS	= \
         Factory.cpp \
         Rect.cpp \
@@ -41,16 +42,17 @@ CPPSRCS	= \
         DrawTargetCairo.cpp \
         SourceSurfaceCairo.cpp \
         PathCairo.cpp \
         DrawTargetRecording.cpp \
         PathRecording.cpp \
         RecordedEvent.cpp \
         DrawEventRecorder.cpp \
         Blur.cpp \
+        Scale.cpp \
         ScaledFontBase.cpp \
         DrawTargetDual.cpp \
         ImageScaling.cpp \
         SourceSurfaceRawData.cpp \
         $(NULL)
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 CPPSRCS	+= \
@@ -71,16 +73,18 @@ endif
 
 DEFINES += -DMOZ_GFX -DUSE_CAIRO -DGFX2D_INTERNAL
 
 ifdef MOZ_ENABLE_SKIA
 CPPSRCS	+= \
         SourceSurfaceSkia.cpp \
         DrawTargetSkia.cpp \
         PathSkia.cpp \
+        convolver.cpp \
+        image_operations.cpp \
         $(NULL)
 
 DEFINES += -DUSE_SKIA
 
 endif
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 ifdef MOZ_ENABLE_SKIA
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.cpp
@@ -0,0 +1,54 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Scale.h"
+
+#ifdef USE_SKIA
+#include "HelpersSkia.h"
+#include "skia/SkBitmap.h"
+#include "image_operations.h"
+#endif
+
+namespace mozilla {
+namespace gfx {
+
+bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+           uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+           SurfaceFormat format)
+{
+#ifdef USE_SKIA
+  bool opaque;
+  if (format == FORMAT_B8G8R8A8) {
+    opaque = false;
+  } else {
+    opaque = true;
+  }
+
+  SkBitmap::Config config = GfxFormatToSkiaConfig(format);
+
+  SkBitmap imgSrc;
+  imgSrc.setConfig(config, srcWidth, srcHeight, srcStride);
+  imgSrc.setPixels(srcData);
+  imgSrc.setIsOpaque(opaque);
+
+  // Rescaler is compatible with 32 bpp only. Convert to RGB32 if needed.
+  if (config != SkBitmap::kARGB_8888_Config) {
+    imgSrc.copyTo(&imgSrc, SkBitmap::kARGB_8888_Config);
+  }
+
+  // This returns an SkBitmap backed by dstData; since it also wrote to dstData,
+  // we don't need to look at that SkBitmap.
+  SkBitmap result = skia::ImageOperations::Resize(imgSrc,
+                                                  skia::ImageOperations::RESIZE_BEST,
+                                                  dstWidth, dstHeight,
+                                                  dstData);
+
+  return result.readyToDraw();
+#else
+  return false;
+#endif
+}
+
+}
+}
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.h
@@ -0,0 +1,36 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_SCALE_H_
+#define MOZILLA_GFX_SCALE_H_
+
+#include "Types.h"
+
+namespace mozilla {
+namespace gfx {
+
+/**
+ * Scale an image using a high-quality filter.
+ *
+ * Synchronously scales an image and writes the output to the destination in
+ * 32-bit format. The destination must be pre-allocated by the caller.
+ *
+ * Returns true if scaling was successful, and false otherwise. Currently, this
+ * function is implemented using Skia. If Skia is not enabled when building,
+ * calling this function will always return false.
+ *
+ * IMPLEMENTATION NOTES:
+ * This API is not currently easily hardware acceleratable. A better API might
+ * take a SourceSurface and return a SourceSurface; the Direct2D backend, for
+ * example, could simply set a status bit on a copy of the image, and use
+ * Direct2D's high-quality scaler at draw time.
+ */
+GFX2D_API bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+                     uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+                     SurfaceFormat format);
+
+}
+}
+
+#endif /* MOZILLA_GFX_SCALE_H_ */
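For reference, a minimal sketch of how a caller might drive the new Scale() entry point; the buffer names, sizes, and helper function below are hypothetical and not part of this patch:

#include <stdint.h>
#include <vector>
#include "Scale.h"                // mozilla::gfx::Scale

static bool ScaleExample()
{
  // Hypothetical 100x100 BGRA source scaled to 50x50; both buffers use
  // tightly packed rows (stride = width * 4) and are owned by the caller.
  const int32_t srcW = 100, srcH = 100, dstW = 50, dstH = 50;
  std::vector<uint8_t> src(srcW * srcH * 4);
  std::vector<uint8_t> dst(dstW * dstH * 4);

  bool ok = mozilla::gfx::Scale(&src[0], srcW, srcH, srcW * 4,
                                &dst[0], dstW, dstH, dstW * 4,
                                mozilla::gfx::FORMAT_B8G8R8A8);
  // Scale() returns false when Skia is not compiled in, so callers need a
  // fallback scaling path for that configuration.
  return ok;
}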
new file mode 100644
--- /dev/null
+++ b/gfx/2d/basictypes.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_BASICTYPES_H_
+#define BASE_BASICTYPES_H_
+
+// Chromium includes a prtypes.h also, but it has been modified to include
+// their build_config.h as well. We can therefore test for both to determine
+// if someone screws up the include order.
+#if defined(prtypes_h___) && !defined(BUILD_BUILD_CONFIG_H_)
+#error You_must_include_basictypes.h_before_prtypes.h!
+#endif
+
+#ifndef NO_NSPR_10_SUPPORT
+#define NO_NSPR_10_SUPPORT
+#define NO_NSPR_10_SUPPORT_SAVE
+#endif
+
+
+#ifdef NO_NSPR_10_SUPPORT_SAVE
+#undef NO_NSPR_10_SUPPORT_SAVE
+#undef NO_NSPR_10_SUPPORT
+#endif
+
+#ifdef _WIN32
+#undef _WIN32
+#define _WIN32_SAVE
+#endif
+
+
+#ifdef _WIN32_SAVE
+#undef _WIN32_SAVE
+#define _WIN32
+#endif
+
+#include <limits.h>         // So we can set the bounds of our types
+#include <stddef.h>         // For size_t
+#include <string.h>         // for memcpy
+
+//#include "base/port.h"    // Types that only need exist on certain systems
+
+#ifndef COMPILER_MSVC
+// stdint.h is part of C99 but MSVC doesn't have it.
+#include <stdint.h>         // For intptr_t.
+#endif
+typedef uint8_t uint8;
+typedef int16_t int16;
+#if 0
+// A type to represent a Unicode code-point value. As of Unicode 4.0,
+// such values require up to 21 bits.
+// (For type-checking on pointers, make this explicitly signed,
+// and it should always be the signed version of whatever int32 is.)
+typedef signed int         char32;
+
+const uint8  kuint8max  = (( uint8) 0xFF);
+const uint16 kuint16max = ((uint16) 0xFFFF);
+const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
+const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
+const  int8  kint8min   = ((  int8) 0x80);
+const  int8  kint8max   = ((  int8) 0x7F);
+const  int16 kint16min  = (( int16) 0x8000);
+const  int16 kint16max  = (( int16) 0x7FFF);
+const  int32 kint32min  = (( int32) 0x80000000);
+const  int32 kint32max  = (( int32) 0x7FFFFFFF);
+const  int64 kint64min  = (( int64) GG_LONGLONG(0x8000000000000000));
+const  int64 kint64max  = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF));
+#endif
+// Platform- and hardware-dependent printf specifiers
+#  if defined(OS_POSIX)
+#    define __STDC_FORMAT_MACROS 1
+#    include <inttypes.h>           // for 64-bit integer format macros
+#    define PRId64L "I64d"
+#    define PRIu64L "I64u"
+#    define PRIx64L "I64x"
+#  elif defined(OS_WIN)
+#    define PRId64 "I64d"
+#    define PRIu64 "I64u"
+#    define PRIx64 "I64x"
+#    define PRId64L L"I64d"
+#    define PRIu64L L"I64u"
+#    define PRIx64L L"I64x"
+#  endif
+
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);               \
+  void operator=(const TypeName&)
+
+// An older, deprecated, politically incorrect name for the above.
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// A macro to disallow all the implicit constructors, namely the
+// default constructor, copy constructor and operator= functions.
+//
+// This should be used in the private: declarations for a class
+// that wants to prevent anyone from instantiating it. This is
+// especially useful for classes containing only static methods.
+#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
+  TypeName();                                    \
+  DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+// One caveat is that arraysize() doesn't accept any array of an
+// anonymous type or a type defined inside a function.  In these rare
+// cases, you have to use the unsafe ARRAYSIZE_UNSAFE() macro below.  This is
+// due to a limitation in C++'s template system.  The limitation might
+// eventually be removed, but it hasn't happened yet.
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef _MSC_VER
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize,
+// but can be used on anonymous types or types defined inside
+// functions.  It's less safe than arraysize as it accepts some
+// (although not all) pointers.  Therefore, you should use arraysize
+// whenever possible.
+//
+// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type
+// size_t.
+//
+// ARRAYSIZE_UNSAFE catches a few type errors.  If you see a compiler error
+//
+//   "warning: division by zero in ..."
+//
+// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer.
+// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays.
+//
+// The following comments are on the implementation details, and can
+// be ignored by the users.
+//
+// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in
+// the array) and sizeof(*(arr)) (the # of bytes in one array
+// element).  If the former is divisible by the latter, perhaps arr is
+// indeed an array, in which case the division result is the # of
+// elements in the array.  Otherwise, arr cannot possibly be an array,
+// and we generate a compiler error to prevent the code from
+// compiling.
+//
+// Since the size of bool is implementation-defined, we need to cast
+// !(sizeof(a) & sizeof(*(a))) to size_t in order to ensure the final
+// result has type size_t.
+//
+// This macro is not perfect as it wrongfully accepts certain
+// pointers, namely where the pointer size is divisible by the pointee
+// size.  Since all our code has to go through a 32-bit compiler,
+// where a pointer is 4 bytes, this means all pointers to a type whose
+// size is 3 or greater than 4 will be (righteously) rejected.
+
+#define ARRAYSIZE_UNSAFE(a) \
+  ((sizeof(a) / sizeof(*(a))) / \
+   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+
+
+// Use implicit_cast as a safe version of static_cast or const_cast
+// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
+// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
+// a const pointer to Foo).
+// When you use implicit_cast, the compiler checks that the cast is safe.
+// Such explicit implicit_casts are necessary in surprisingly many
+// situations where C++ demands an exact type match instead of an
+// argument type convertible to a target type.
+//
+// The From type can be inferred, so the preferred syntax for using
+// implicit_cast is the same as for static_cast etc.:
+//
+//   implicit_cast<ToType>(expr)
+//
+// implicit_cast would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+template<typename To, typename From>
+inline To implicit_cast(From const &f) {
+  return f;
+}
+
+// The COMPILE_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   COMPILE_ASSERT(ARRAYSIZE_UNSAFE(content_type_names) == CONTENT_NUM_TYPES,
+//                  content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {
+};
+
+#undef COMPILE_ASSERT
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// Implementation details of COMPILE_ASSERT:
+//
+// - COMPILE_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
+//                               // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     COMPILE_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+
+// MetatagId refers to metatag-id that we assign to
+// each metatag <name, value> pair..
+//typedef uint32 MetatagId;
+
+// Argument type used in interfaces that can optionally take ownership
+// of a passed in argument.  If TAKE_OWNERSHIP is passed, the called
+// object takes ownership of the argument.  Otherwise it does not.
+enum Ownership {
+  DO_NOT_TAKE_OWNERSHIP,
+  TAKE_OWNERSHIP
+};
+
+// bit_cast<Dest,Source> is a template function that implements the
+// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
+// very low-level functions like the protobuf library and fast math
+// support.
+//
+//   float f = 3.14159265358979;
+//   int i = bit_cast<int32>(f);
+//   // i = 0x40490fdb
+//
+// The classical address-casting method is:
+//
+//   // WRONG
+//   float f = 3.14159265358979;            // WRONG
+//   int i = * reinterpret_cast<int*>(&f);  // WRONG
+//
+// The address-casting method actually produces undefined behavior
+// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
+// section says: if an object in memory has one type, and a program
+// accesses it with a different type, then the result is undefined
+// behavior for most values of "different type".
+//
+// This is true for any cast syntax, either *(int*)&f or
+// *reinterpret_cast<int*>(&f).  And it is particularly true for
+// conversions between integral lvalues and floating-point lvalues.
+//
+// The purpose of 3.10 -15- is to allow optimizing compilers to assume
+// that expressions with different types refer to different memory.  gcc
+// 4.0.1 has an optimizer that takes advantage of this.  So a
+// non-conforming program quietly produces wildly incorrect output.
+//
+// The problem is not the use of reinterpret_cast.  The problem is type
+// punning: holding an object in memory of one type and reading its bits
+// back using a different type.
+//
+// The C++ standard is more subtle and complex than this, but that
+// is the basic idea.
+//
+// Anyways ...
+//
+// bit_cast<> calls memcpy() which is blessed by the standard,
+// especially by the example in section 3.9 .  Also, of course,
+// bit_cast<> wraps up the nasty logic in one place.
+//
+// Fortunately memcpy() is very fast.  In optimized mode, with a
+// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
+// code with the minimal amount of data movement.  On a 32-bit system,
+// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
+// compiles to two loads and two stores.
+//
+// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
+//
+// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
+// is likely to surprise you.
+
+template <class Dest, class Source>
+inline Dest bit_cast(const Source& source) {
+  // Compile time assertion: sizeof(Dest) == sizeof(Source)
+  // A compile error here means your Dest and Source have different sizes.
+  typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];
+
+  Dest dest;
+  memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+// The following enum should be used only as a constructor argument to indicate
+// that the variable has static storage class, and that the constructor should
+// do nothing to its state.  It indicates to the reader that it is legal to
+// declare a static instance of the class, provided the constructor is given
+// the base::LINKER_INITIALIZED argument.  Normally, it is unsafe to declare a
+// static variable that has a constructor or a destructor because invocation
+// order is undefined.  However, IF the type can be initialized by filling with
+// zeroes (which the loader does for static variables), AND the destructor also
+// does nothing to the storage, AND there are no virtual methods, then a
+// constructor declared as
+//       explicit MyClass(base::LinkerInitialized x) {}
+// and invoked as
+//       static MyClass my_variable_name(base::LINKER_INITIALIZED);
+namespace base {
+enum LinkerInitialized { LINKER_INITIALIZED };
+}  // base
+
+
+
+
+#endif  // BASE_BASICTYPES_H_
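The helpers in basictypes.h are easiest to see in use; a small illustration (not part of the patch, and the names are made up):

#include "basictypes.h"

static const int kTable[] = { 10, 20, 30, 40 };

// arraysize() is a compile-time constant, so it can size another array.
static char kShadow[arraysize(kTable)];

// COMPILE_ASSERT turns a violated invariant into a compile-time error.
COMPILE_ASSERT(ARRAYSIZE_UNSAFE(kTable) == 4, kTable_has_unexpected_size);

// bit_cast<> reinterprets the bits of a value via memcpy, avoiding the
// undefined behaviour of *reinterpret_cast<int*>(&f). This assumes int and
// float have the same size, which holds on the platforms this code targets.
inline int FloatBits(float f) {
  return bit_cast<int>(f);
}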
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.cpp
@@ -0,0 +1,864 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "convolver.h"
+
+#include <algorithm>
+#include "nsAlgorithm.h"
+
+#include "skia/SkTypes.h"
+
+// note: SIMD_SSE2 is not enabled because of bugs, apparently
+
+#if defined(SIMD_SSE2)
+#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
+#endif
+
+namespace skia {
+
+namespace {
+
+// Converts the argument to an 8-bit unsigned value by clamping to the range
+// 0-255.
+inline unsigned char ClampTo8(int a) {
+  if (static_cast<unsigned>(a) < 256)
+    return a;  // Avoid the extra check in the common case.
+  if (a < 0)
+    return 0;
+  return 255;
+}
+
+// Stores a list of rows in a circular buffer. The usage is you write into it
+// by calling AdvanceRow. It will keep track of which row in the buffer it
+// should use next, and the total number of rows added.
+class CircularRowBuffer {
+ public:
+  // The number of pixels in each row is given in |source_row_pixel_width|.
+  // The maximum number of rows needed in the buffer is |max_y_filter_size|
+  // (we only need to store enough rows for the biggest filter).
+  //
+  // We use the |first_input_row| to compute the coordinates of all of the
+  // following rows returned by Advance().
+  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
+                    int first_input_row)
+      : row_byte_width_(dest_row_pixel_width * 4),
+        num_rows_(max_y_filter_size),
+        next_row_(0),
+        next_row_coordinate_(first_input_row) {
+    buffer_.resize(row_byte_width_ * max_y_filter_size);
+    row_addresses_.resize(num_rows_);
+  }
+
+  // Moves to the next row in the buffer, returning a pointer to the beginning
+  // of it.
+  unsigned char* AdvanceRow() {
+    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
+    next_row_coordinate_++;
+
+    // Set the pointer to the next row to use, wrapping around if necessary.
+    next_row_++;
+    if (next_row_ == num_rows_)
+      next_row_ = 0;
+    return row;
+  }
+
+  // Returns a pointer to an "unrolled" array of rows. These rows will start
+  // at the y coordinate placed into |*first_row_index| and will continue in
+  // order for the maximum number of rows in this circular buffer.
+  //
+  // The |first_row_index_| may be negative. This means the circular buffer
+  // starts before the top of the image (it hasn't been filled yet).
+  unsigned char* const* GetRowAddresses(int* first_row_index) {
+    // Example for a 4-element circular buffer holding coords 6-9.
+    //   Row 0   Coord 8
+    //   Row 1   Coord 9
+    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
+    //   Row 3   Coord 7
+    //
+    // The "next" row is also the first (lowest) coordinate. This computation
+    // may yield a negative value, but that's OK, the math will work out
+    // since the user of this buffer will compute the offset relative
+    // to the first_row_index and the negative rows will never be used.
+    *first_row_index = next_row_coordinate_ - num_rows_;
+
+    int cur_row = next_row_;
+    for (int i = 0; i < num_rows_; i++) {
+      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
+
+      // Advance to the next row, wrapping if necessary.
+      cur_row++;
+      if (cur_row == num_rows_)
+        cur_row = 0;
+    }
+    return &row_addresses_[0];
+  }
+
+ private:
+  // The buffer storing the rows. They are packed, each one row_byte_width_.
+  std::vector<unsigned char> buffer_;
+
+  // Number of bytes per row in the |buffer_|.
+  int row_byte_width_;
+
+  // The number of rows available in the buffer.
+  int num_rows_;
+
+  // The next row index we should write into. This wraps around as the
+  // circular buffer is used.
+  int next_row_;
+
+  // The y coordinate of the |next_row_|. This is incremented each time a
+  // new row is appended and does not wrap.
+  int next_row_coordinate_;
+
+  // Buffer used by GetRowAddresses().
+  std::vector<unsigned char*> row_addresses_;
+};
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+template<bool has_alpha>
+void ConvolveHorizontally(const unsigned char* src_data,
+                          const ConvolutionFilter1D& filter,
+                          unsigned char* out_row) {
+  // Loop over each pixel on this row in the output image.
+  int num_values = filter.num_values();
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    // Get the filter that determines the current output pixel.
+    int filter_offset, filter_length;
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const unsigned char* row_to_filter = &src_data[filter_offset * 4];
+
+    // Apply the filter to the row to get the destination pixel in |accum|.
+    int accum[4] = {0};
+    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
+      accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
+      accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
+      accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of fractional part.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
+    out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
+    out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
+    if (has_alpha)
+      out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
+  }
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
+                        int filter_length,
+                        unsigned char* const* source_data_rows,
+                        int pixel_width,
+                        unsigned char* out_row) {
+  // We go through each column in the output and do a vertical convolution,
+  // generating one output pixel each time.
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
+    // Compute the number of bytes over in each row that the current column
+    // we're convolving starts at. The pixel will cover the next 4 bytes.
+    int byte_offset = out_x * 4;
+
+    // Apply the filter to one column of pixels.
+    int accum[4] = {0};
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
+      accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
+      accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
+      accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of precision.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[byte_offset + 0] = ClampTo8(accum[0]);
+    out_row[byte_offset + 1] = ClampTo8(accum[1]);
+    out_row[byte_offset + 2] = ClampTo8(accum[2]);
+    if (has_alpha) {
+      unsigned char alpha = ClampTo8(accum[3]);
+
+      // Make sure the alpha channel doesn't come out smaller than any of the
+      // color channels. We use premultiplied alpha channels, so this should
+      // never happen, but rounding errors will cause this from time to time.
+      // These "impossible" colors will cause overflows (and hence random pixel
+      // values) when the resulting bitmap is drawn to the screen.
+      //
+      // We only need to do this when generating the final output row (here).
+      int max_color_channel = NS_MAX(out_row[byte_offset + 0],
+          NS_MAX(out_row[byte_offset + 1], out_row[byte_offset + 2]));
+      if (alpha < max_color_channel)
+        out_row[byte_offset + 3] = max_color_channel;
+      else
+        out_row[byte_offset + 3] = alpha;
+    } else {
+      // No alpha channel, the image is opaque.
+      out_row[byte_offset + 3] = 0xff;
+    }
+  }
+}
+
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+void ConvolveHorizontally_SSE2(const unsigned char* src_data,
+                               const ConvolutionFilter1D& filter,
+                               unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    __m128i accum = _mm_setzero_si128();
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const __m128i* row_to_filter =
+        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+      __m128i coeff, coeff16;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Load four pixels => unpack the first two pixels to 16 bits =>
+      // multiply with coefficients => accumulate the convolution result.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Duplicate 3rd and 4th coefficients for all channels =>
+      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+      // => accumulate the convolution results.
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      // [16] a3 g3 b3 r3 a2 g2 b2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Advance the pixel and coefficients pointers.
+      row_to_filter += 1;
+      filter_values += 4;
+    }
+
+    // When |filter_length| is not divisible by 4, we need to mask out to zero
+    // the extra filter coefficients that were loaded; other than that the
+    // algorithm is the same as above, except that the 4th pixel will always
+    // be absent.
+    int r = filter_length&3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff, coeff16;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Note: line buffer must be padded to align_up(filter_offset, 16).
+      // We resolve this by using the C version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    accum = _mm_packs_epi32(accum, zero);
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    accum = _mm_packus_epi16(accum, zero);
+
+    // Store the pixel value of 32 bits.
+    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+    out_row += 4;
+  }
+#endif
+}
+
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
+                                const ConvolutionFilter1D& filter,
+                                unsigned char* out_row[4]) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // four pixels in a column per iteration.
+    __m128i accum0 = _mm_setzero_si128();
+    __m128i accum1 = _mm_setzero_si128();
+    __m128i accum2 = _mm_setzero_si128();
+    __m128i accum3 = _mm_setzero_si128();
+    int start = (filter_offset<<2);
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+      __m128i coeff, coeff16lo, coeff16hi;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                          \
+      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
+      src16 = _mm_unpacklo_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      src16 = _mm_unpackhi_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t)
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+
+      start += 16;
+      filter_values += 4;
+    }
+
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8);
+      __m128i coeff;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+
+      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      /* c1 c1 c1 c1 c0 c0 c0 c0 */
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum0 = _mm_packs_epi32(accum0, zero);
+    accum0 = _mm_packus_epi16(accum0, zero);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_packs_epi32(accum1, zero);
+    accum1 = _mm_packus_epi16(accum1, zero);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_packs_epi32(accum2, zero);
+    accum2 = _mm_packus_epi16(accum2, zero);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_packs_epi32(accum3, zero);
+    accum3 = _mm_packus_epi16(accum3, zero);
+
+    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+    out_row[0] += 4;
+    out_row[1] += 4;
+    out_row[2] += 4;
+    out_row[3] += 4;
+  }
+#endif
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int width = pixel_width & ~3;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i accum0, accum1, accum2, accum3, coeff16;
+  const __m128i* src;
+  // Output four pixels per iteration (16 bytes).
+  for (int out_x = 0; out_x < width; out_x += 4) {
+
+    // Accumulated result for each pixel. 32 bits per RGBA channel.
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    accum3 = _mm_setzero_si128();
+
+    // Convolve with one filter coefficient per iteration.
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+      // Duplicate the filter coefficient 8 times.
+      // [16] cj cj cj cj cj cj cj cj
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+      // Load four pixels (16 bytes) together.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][out_x << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+
+      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+
+      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+      // [32] a3 b3 g3 r3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum3 = _mm_add_epi32(accum3, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, accum3);
+
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+
+    if (has_alpha) {
+      // Compute the max(ri, gi, bi) for each pixel.
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+
+      // Make sure the value of alpha channel is always larger than maximum
+      // value of color channels.
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      // Set value of alpha channels to 0xFF.
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    // Store the convolution result (16 bytes) and advance the pixel pointers.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+    out_row += 16;
+  }
+
+  // When the width of the output is not divisible by 4, we need to save one
+  // pixel (4 bytes) at a time, and the fourth pixel is always absent.
+  if (pixel_width & 3) {
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, zero);
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+    if (has_alpha) {
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    for (int out_x = width; out_x < pixel_width; out_x++) {
+      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+      accum0 = _mm_srli_si128(accum0, 4);
+      out_row += 4;
+    }
+  }
+#endif
+}
+
+}  // namespace
+
+// ConvolutionFilter1D ---------------------------------------------------------
+
+ConvolutionFilter1D::ConvolutionFilter1D()
+    : max_filter_(0) {
+}
+
+ConvolutionFilter1D::~ConvolutionFilter1D() {
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const float* filter_values,
+                                    int filter_length) {
+  SkASSERT(filter_length > 0);
+
+  std::vector<Fixed> fixed_values;
+  fixed_values.reserve(filter_length);
+
+  for (int i = 0; i < filter_length; ++i)
+    fixed_values.push_back(FloatToFixed(filter_values[i]));
+
+  AddFilter(filter_offset, &fixed_values[0], filter_length);
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const Fixed* filter_values,
+                                    int filter_length) {
+  // It is common for leading/trailing filter values to be zeros. In such
+  // cases it is beneficial to only store the central factors.
+  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
+  // a 1080p image this optimization gives a ~10% speed improvement.
+  int first_non_zero = 0;
+  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
+    first_non_zero++;
+
+  if (first_non_zero < filter_length) {
+    // Here we have at least one non-zero factor.
+    int last_non_zero = filter_length - 1;
+    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
+      last_non_zero--;
+
+    filter_offset += first_non_zero;
+    filter_length = last_non_zero + 1 - first_non_zero;
+    SkASSERT(filter_length > 0);
+
+    for (int i = first_non_zero; i <= last_non_zero; i++)
+      filter_values_.push_back(filter_values[i]);
+  } else {
+    // Here all the factors were zeroes.
+    filter_length = 0;
+  }
+
+  FilterInstance instance;
+
+  // We pushed filter_length elements onto filter_values_
+  instance.data_location = (static_cast<int>(filter_values_.size()) -
+                            filter_length);
+  instance.offset = filter_offset;
+  instance.length = filter_length;
+  filters_.push_back(instance);
+
+  max_filter_ = NS_MAX(max_filter_, filter_length);
+}
+
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& filter_x,
+                    const ConvolutionFilter1D& filter_y,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2) {
+#if !defined(SIMD_SSE2)
+  // Even if we have runtime support for SSE2 instructions, since the binary
+  // was not built with SSE2 support, we have to fall back to the C version.
+  use_sse2 = false;
+#endif
+
+  int max_y_filter_size = filter_y.max_filter();
+
+  // The next row in the input that we will generate a horizontally
+  // convolved row for. If the filter doesn't start at the beginning of the
+  // image (this is the case when we are only resizing a subset), then we
+  // don't want to generate any output rows before that. Compute the starting
+  // row for convolution as the first pixel for the first vertical filter.
+  int filter_offset, filter_length;
+  const ConvolutionFilter1D::Fixed* filter_values =
+      filter_y.FilterForValue(0, &filter_offset, &filter_length);
+  int next_x_row = filter_offset;
+
+  // We loop over each row in the input doing a horizontal convolution. This
+  // will result in a horizontally convolved image. We write the results into
+  // a circular buffer of convolved rows and do vertical convolution as rows
+  // are available. This prevents us from having to store the entire
+  // intermediate image and helps cache coherency.
+  // We will need four extra rows so that four horizontal convolutions can be
+  // done simultaneously. We also pad each row in the row buffer so that it is
+  // aligned up to 16 bytes.
+  // TODO(jiesun): We do not use aligned load from row buffer in vertical
+  // convolution pass yet. Somehow Windows does not like it.
+  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
+  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
+  CircularRowBuffer row_buffer(row_buffer_width,
+                               row_buffer_height,
+                               filter_offset);
+
+  // Loop over every possible output row, processing just enough horizontal
+  // convolutions to run each subsequent vertical convolution.
+  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
+  int num_output_rows = filter_y.num_values();
+
+  // We need to check which is the last line to convolve before we advance 4
+  // lines in one iteration.
+  int last_filter_offset, last_filter_length;
+  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
+                          &last_filter_length);
+
+  for (int out_y = 0; out_y < num_output_rows; out_y++) {
+    filter_values = filter_y.FilterForValue(out_y,
+                                            &filter_offset, &filter_length);
+
+    // Generate output rows until we have enough to run the current filter.
+    if (use_sse2) {
+      while (next_x_row < filter_offset + filter_length) {
+        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
+          const unsigned char* src[4];
+          unsigned char* out_row[4];
+          for (int i = 0; i < 4; ++i) {
+            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
+            out_row[i] = row_buffer.AdvanceRow();
+          }
+          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
+          next_x_row += 4;
+        } else {
+          // For the last row, the SSE2 load may access data beyond the image
+          // area, therefore we use the C version here.
+          if (next_x_row == last_filter_offset + last_filter_length - 1) {
+            if (source_has_alpha) {
+              ConvolveHorizontally<true>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            } else {
+              ConvolveHorizontally<false>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            }
+          } else {
+            ConvolveHorizontally_SSE2(
+                &source_data[next_x_row * source_byte_row_stride],
+                filter_x, row_buffer.AdvanceRow());
+          }
+          next_x_row++;
+        }
+      }
+    } else {
+      while (next_x_row < filter_offset + filter_length) {
+        if (source_has_alpha) {
+          ConvolveHorizontally<true>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        } else {
+          ConvolveHorizontally<false>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        }
+        next_x_row++;
+      }
+    }
+
+    // Compute where in the output image this row of final data will go.
+    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
+
+    // Get the list of rows that the circular buffer has, in order.
+    int first_row_in_circular_buffer;
+    unsigned char* const* rows_to_convolve =
+        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
+
+    // Now compute the start of the subset of those rows that the filter
+    // needs.
+    unsigned char* const* first_row_for_filter =
+        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
+
+    if (source_has_alpha) {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<true>(filter_values, filter_length,
+                                      first_row_for_filter,
+                                      filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<true>(filter_values, filter_length,
+                                 first_row_for_filter,
+                                 filter_x.num_values(), cur_output_row);
+      }
+    } else {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<false>(filter_values, filter_length,
+                                       first_row_for_filter,
+                                       filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<false>(filter_values, filter_length,
+                                 first_row_for_filter,
+                                 filter_x.num_values(), cur_output_row);
+      }
+    }
+  }
+}
+
+}  // namespace skia
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_CONVOLVER_H_
+#define SKIA_EXT_CONVOLVER_H_
+
+#include <cmath>
+#include <vector>
+
+#include "basictypes.h"
+#include "prtypes.h"
+#include "cpu.h"
+#include "skia/SkTypes.h"
+
+// avoid confusion with Mac OS X's math library (Carbon)
+#if defined(__APPLE__)
+#undef FloatToFixed
+#undef FixedToFloat
+#endif
+
+namespace skia {
+
+// Represents a filter in one dimension. Each output pixel has one entry in this
+// object for the filter values contributing to it. You build up the filter
+// list by calling AddFilter for each output pixel (in order).
+//
+// We do 2-dimensional convolution by first convolving each row by one
+// ConvolutionFilter1D, then convolving each column by another one.
+//
+// Entries are stored in fixed point, shifted left by kShiftBits.
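+//
+// A minimal usage sketch (illustrative coefficients only): one output pixel
+// blending three source pixels starting at offset 10 with a normalized
+// 1:2:1 kernel:
+//
+//   ConvolutionFilter1D filter;
+//   float taps[3] = { 0.25f, 0.5f, 0.25f };  // sums to 1, preserves brightness
+//   filter.AddFilter(10, taps, 3);
+//
+//   int offset, length;
+//   const ConvolutionFilter1D::Fixed* values =
+//       filter.FilterForValue(0, &offset, &length);  // offset == 10, length == 3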
+class ConvolutionFilter1D {
+ public:
+  typedef short Fixed;
+
+  // The number of bits that fixed point values are shifted by.
+  enum { kShiftBits = 14 };
+
+  ConvolutionFilter1D();
+  ~ConvolutionFilter1D();
+
+  // Convert between floating point and our fixed point representation.
+  static Fixed FloatToFixed(float f) {
+    return static_cast<Fixed>(f * (1 << kShiftBits));
+  }
+  static unsigned char FixedToChar(Fixed x) {
+    return static_cast<unsigned char>(x >> kShiftBits);
+  }
+  static float FixedToFloat(Fixed x) {
+    // The cast relies on Fixed being a short, implying that on
+    // the platforms we care about all (16) bits will fit into
+    // the mantissa of a (32-bit) float.
+    COMPILE_ASSERT(sizeof(Fixed) == 2, fixed_type_should_fit_in_float_mantissa);
+    float raw = static_cast<float>(x);
+    return ldexpf(raw, -kShiftBits);
+  }
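+  //
+  // For example (illustrative), with kShiftBits == 14:
+  //   FloatToFixed(0.25f) == 4096   // 0.25 * (1 << 14)
+  //   FixedToFloat(4096)  == 0.25f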
+
+  // Returns the maximum pixel span of a filter.
+  int max_filter() const { return max_filter_; }
+
+  // Returns the number of filters in this set. This is the dimension of the
+  // output image in the direction being convolved.
+  int num_values() const { return static_cast<int>(filters_.size()); }
+
+  // Appends the given list of scaling values for generating a given output
+  // pixel. |filter_offset| is the distance from the edge of the image to where
+  // the scaling factors start. The scaling factors apply to the source pixels
+  // starting from this position, and going for the next |filter_length| pixels.
+  //
+  // You will probably want to make sure your input is normalized (that is,
+  // all entries in |filter_values| sum to one) to prevent affecting the overall
+  // brightness of the image.
+  //
+  // The filter_length must be > 0.
+  //
+  // This version will automatically convert your input to fixed point.
+  void AddFilter(int filter_offset,
+                 const float* filter_values,
+                 int filter_length);
+
+  // Same as the above version, but the input is already fixed point.
+  void AddFilter(int filter_offset,
+                 const Fixed* filter_values,
+                 int filter_length);
+
+  // Retrieves a filter for the given |value_offset|, a position in the output
+  // image in the direction we're convolving. The offset and length of the
+  // filter values are put into the corresponding out arguments (see AddFilter
+  // above for what these mean), and a pointer to the first scaling factor is
+  // returned. There will be |filter_length| values in this array.
+  inline const Fixed* FilterForValue(int value_offset,
+                                     int* filter_offset,
+                                     int* filter_length) const {
+    const FilterInstance& filter = filters_[value_offset];
+    *filter_offset = filter.offset;
+    *filter_length = filter.length;
+    if (filter.length == 0) {
+      return NULL;
+    }
+    return &filter_values_[filter.data_location];
+  }
+
+  inline void PaddingForSIMD(int padding_count) {
+    // Appends |padding_count| dummy coefficients after the coefficients of the
+    // last filter so that SIMD instructions, which load 8 or 16 bytes at a
+    // time, never read past the end of the buffer. We are not trying to align
+    // the coefficients right now due to the opaqueness of the <vector>
+    // implementation. This has to be done after all |AddFilter| calls.
+    for (int i = 0; i < padding_count; ++i)
+      filter_values_.push_back(static_cast<Fixed>(0));
+  }
+
+ private:
+  struct FilterInstance {
+    // Offset within filter_values for this instance of the filter.
+    int data_location;
+
+    // Distance from the edge of the image to where the filter values start,
+    // in pixels (see AddFilter above).
+    int offset;
+
+    // Number of values in this filter instance.
+    int length;
+  };
+
+  // Stores the information for each filter added to this class.
+  std::vector<FilterInstance> filters_;
+
+  // We store all the filter values in this flat list, indexed by
+  // |FilterInstance.data_location| to avoid the mallocs required for storing
+  // each one separately.
+  std::vector<Fixed> filter_values_;
+
+  // The maximum size of any filter we've added.
+  int max_filter_;
+};
+
+// Does a two-dimensional convolution on the given source image.
+//
+// It is assumed the source pixel offsets referenced in the input filters
+// reference only valid pixels, so the source image size is not required. Each
+// row of the source image starts |source_byte_row_stride| after the previous
+// one (this allows you to have rows with some padding at the end).
+//
+// The result will be put into the given output buffer. The destination image
+// size will be xfilter.num_values() * yfilter.num_values() pixels. It will be
+// in rows of exactly xfilter.num_values() * 4 bytes.
+//
+// |source_has_alpha| is a hint that allows us to avoid doing computations on
+// the alpha channel if the image is opaque. If you don't know, set this to
+// true and it will work properly, but setting this to false will be a few
+// percent faster if you know the image is opaque.
+//
+// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
+// (this is ARGB when loaded into 32-bit words on a little-endian machine).
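+//
+// A minimal calling sketch (illustrative; the pixel buffers and the two
+// filters are assumed to have been prepared by the caller, e.g. as
+// ImageOperations::ResizeBasic does with ResizeFilter):
+//
+//   BGRAConvolve2D(src_pixels, src_row_bytes, /* source_has_alpha = */ true,
+//                  x_filter, y_filter,
+//                  dst_row_bytes, dst_pixels, /* use_sse2 = */ false);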
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& xfilter,
+                    const ConvolutionFilter1D& yfilter,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2);
+}  // namespace skia
+
+#endif  // SKIA_EXT_CONVOLVER_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/cpu.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_CPU_H_
+#define BASE_CPU_H_
+
+#include <string>
+
+namespace base {
+
+// Query information about the processor.
+class CPU {
+ public:
+  // Constructor
+  CPU();
+
+  // Accessors for CPU information.
+  const std::string& vendor_name() const { return cpu_vendor_; }
+  int stepping() const { return stepping_; }
+  int model() const { return model_; }
+  int family() const { return family_; }
+  int type() const { return type_; }
+  int extended_model() const { return ext_model_; }
+  int extended_family() const { return ext_family_; }
+
+ private:
+  // Query the processor for CPUID information.
+  void Initialize();
+
+  int type_;  // process type
+  int family_;  // family of the processor
+  int model_;  // model of processor
+  int stepping_;  // processor revision number
+  int ext_model_;
+  int ext_family_;
+  std::string cpu_vendor_;
+};
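+
+// A minimal usage sketch (illustrative):
+//
+//   base::CPU cpu;
+//   const std::string& vendor = cpu.vendor_name();  // e.g. "GenuineIntel"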
+
+}  // namespace base
+
+#endif  // BASE_CPU_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.cpp
@@ -0,0 +1,536 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "basictypes.h"
+
+#define _USE_MATH_DEFINES
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "image_operations.h"
+
+#include "nsAlgorithm.h"
+#include "stack_container.h"
+#include "convolver.h"
+#include "skia/SkColorPriv.h"
+#include "skia/SkBitmap.h"
+#include "skia/SkRect.h"
+#include "skia/SkFontHost.h"
+
+namespace skia {
+
+namespace {
+
+// Returns the ceiling/floor as an integer.
+inline int CeilInt(float val) {
+  return static_cast<int>(ceil(val));
+}
+inline int FloorInt(float val) {
+  return static_cast<int>(floor(val));
+}
+
+// Filter function computation -------------------------------------------------
+
+// Evaluates the box filter, which goes from -0.5 to +0.5.
+float EvalBox(float x) {
+  return (x >= -0.5f && x < 0.5f) ? 1.0f : 0.0f;
+}
+
+// Evaluates the Lanczos filter of the given filter size window for the given
+// position.
+//
+// |filter_size| is the width of the filter (the "window"), outside of which
+// the value of the function is 0. Inside of the window, the value is the
+// normalized sinc function:
+//   lanczos(x) = sinc(x) * sinc(x / filter_size);
+// where
+//   sinc(x) = sin(pi*x) / (pi*x);
+float EvalLanczos(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the discontinuity at the origin.
+  float xpi = x * static_cast<float>(M_PI);
+  return (sin(xpi) / xpi) *  // sinc(x)
+          sin(xpi / filter_size) / (xpi / filter_size);  // sinc(x/filter_size)
+}
+
+// Evaluates the Hamming filter of the given filter size window for the given
+// position.
+//
+// The filter covers [-filter_size, +filter_size]. Outside of this window
+// the value of the function is 0. Inside of the window, the value is sinus
+// cardinal multiplied by a recentered Hamming function. The traditional
+// Hamming formula for a window of size N and n ranging in [0, N-1] is:
+//   hamming(n) = 0.54 - 0.46 * cos(2 * pi * n / (N-1)))
+// In our case we want the function centered for x == 0 and at its minimum
+// on both ends of the window (x == +/- filter_size), hence the adjusted
+// formula:
+//   hamming(x) = (0.54 -
+//                 0.46 * cos(2 * pi * (x - filter_size)/ (2 * filter_size)))
+//              = 0.54 - 0.46 * cos(pi * x / filter_size - pi)
+//              = 0.54 + 0.46 * cos(pi * x / filter_size)
+float EvalHamming(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the sinc discontinuity at the origin.
+  const float xpi = x * static_cast<float>(M_PI);
+
+  return ((sin(xpi) / xpi) *  // sinc(x)
+          (0.54f + 0.46f * cos(xpi / filter_size)));  // hamming(x)
+}
+
+// ResizeFilter ----------------------------------------------------------------
+
+// Encapsulates computation and storage of the filters required for one complete
+// resize operation.
+class ResizeFilter {
+ public:
+  ResizeFilter(ImageOperations::ResizeMethod method,
+               int src_full_width, int src_full_height,
+               int dest_width, int dest_height,
+               const SkIRect& dest_subset);
+
+  // Returns the filled filter values.
+  const ConvolutionFilter1D& x_filter() { return x_filter_; }
+  const ConvolutionFilter1D& y_filter() { return y_filter_; }
+
+ private:
+  // Returns the number of pixels that the filter spans, in filter space (the
+  // destination image).
+  float GetFilterSupport(float scale) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        // The box filter just scales with the image scaling.
+        return 0.5f;  // Only want one side of the filter = /2.
+      case ImageOperations::RESIZE_HAMMING1:
+        // The Hamming filter takes as much space in the source image in
+        // each direction as the size of the window = 1 for Hamming1.
+        return 1.0f;
+      case ImageOperations::RESIZE_LANCZOS2:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 2 for Lanczos2.
+        return 2.0f;
+      case ImageOperations::RESIZE_LANCZOS3:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 3 for Lanczos3.
+        return 3.0f;
+      default:
+        return 1.0f;
+    }
+  }
+
+  // Computes one set of filters either horizontally or vertically. The caller
+  // specifies generic "min" and "max" values rather than left/right or
+  // top/bottom so that the same code can be re-used for each dimension.
+  //
+  // |src_size| is the length of the source image in this dimension, and
+  // |dest_subset_lo| and |dest_subset_size| give the range of destination
+  // values to compute.
+  //
+  // |scale| is the ratio of destination size to source size, and |src_support|
+  // is the support of the filter in source space (see the constructor). The
+  // computed filters are appended to |output|.
+  void ComputeFilters(int src_size,
+                      int dest_subset_lo, int dest_subset_size,
+                      float scale, float src_support,
+                      ConvolutionFilter1D* output);
+
+  // Computes the filter value given the coordinate in filter space.
+  inline float ComputeFilter(float pos) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        return EvalBox(pos);
+      case ImageOperations::RESIZE_HAMMING1:
+        return EvalHamming(1, pos);
+      case ImageOperations::RESIZE_LANCZOS2:
+        return EvalLanczos(2, pos);
+      case ImageOperations::RESIZE_LANCZOS3:
+        return EvalLanczos(3, pos);
+      default:
+        return 0;
+    }
+  }
+
+  ImageOperations::ResizeMethod method_;
+
+  // Size of the filter support on one side only in the destination space.
+  // See GetFilterSupport.
+  float x_filter_support_;
+  float y_filter_support_;
+
+  // Subset of scaled destination bitmap to compute.
+  SkIRect out_bounds_;
+
+  ConvolutionFilter1D x_filter_;
+  ConvolutionFilter1D y_filter_;
+
+  DISALLOW_COPY_AND_ASSIGN(ResizeFilter);
+};
+
+ResizeFilter::ResizeFilter(ImageOperations::ResizeMethod method,
+                           int src_full_width, int src_full_height,
+                           int dest_width, int dest_height,
+                           const SkIRect& dest_subset)
+    : method_(method),
+      out_bounds_(dest_subset) {
+  // method_ will only ever refer to an "algorithm method".
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  float scale_x = static_cast<float>(dest_width) /
+                  static_cast<float>(src_full_width);
+  float scale_y = static_cast<float>(dest_height) /
+                  static_cast<float>(src_full_height);
+
+  x_filter_support_ = GetFilterSupport(scale_x);
+  y_filter_support_ = GetFilterSupport(scale_y);
+
+  // Support of the filter in source space.
+  float src_x_support = x_filter_support_ / scale_x;
+  float src_y_support = y_filter_support_ / scale_y;
+
+  ComputeFilters(src_full_width, dest_subset.fLeft, dest_subset.width(),
+                 scale_x, src_x_support, &x_filter_);
+  ComputeFilters(src_full_height, dest_subset.fTop, dest_subset.height(),
+                 scale_y, src_y_support, &y_filter_);
+}
+
+// TODO(egouriou): Take advantage of periods in the convolution.
+// Practical resizing filters are periodic outside of the border area.
+// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
+// source become p pixels in the destination) will have a period of p.
+// A nice consequence is a period of 1 when downscaling by an integral
+// factor. Downscaling from typical display resolutions is also bound
+// to produce interesting periods as those are chosen to have multiple
+// small factors.
+// Small periods reduce computational load and improve cache usage if
+// the coefficients can be shared. For periods of 1 we can consider
+// loading the factors only once outside the borders.
+void ResizeFilter::ComputeFilters(int src_size,
+                                  int dest_subset_lo, int dest_subset_size,
+                                  float scale, float src_support,
+                                  ConvolutionFilter1D* output) {
+  int dest_subset_hi = dest_subset_lo + dest_subset_size;  // [lo, hi)
+
+  // When we're doing a magnification, the scale will be larger than one. This
+  // means the destination pixels are much smaller than the source pixels, and
+  // that the range covered by the filter won't necessarily cover any source
+  // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+  // some computations.
+  float clamped_scale = NS_MIN(1.0f, scale);
+
+  // Speed up the divisions below by turning them into multiplies.
+  float inv_scale = 1.0f / scale;
+
+  StackVector<float, 64> filter_values;
+  StackVector<int16_t, 64> fixed_filter_values;
+
+  // Loop over all pixels in the output range. We will generate one set of
+  // filter values for each one. Those values will tell us how to blend the
+  // source pixels to compute the destination pixel.
+  for (int dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi;
+       dest_subset_i++) {
+    // Reset the arrays. We don't declare them inside so they can re-use the
+    // same malloc-ed buffer.
+    filter_values->clear();
+    fixed_filter_values->clear();
+
+    // This is the pixel in the source directly under the pixel in the dest.
+    // Note that we base computations on the "center" of the pixels. To see
+    // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
+    // downscale should "cover" the pixels around the pixel with *its center*
+    // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
+    // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
+    // TODO(evannier): this code is therefore incorrect and should read:
+    // float src_pixel = (static_cast<float>(dest_subset_i) + 0.5f) * inv_scale;
+    // I leave it incorrect, because changing it would require modifying
+    // the results for the webkit test, which I will do in a subsequent checkin.
+    float src_pixel = dest_subset_i * inv_scale;
+
+    // Compute the (inclusive) range of source pixels the filter covers.
+    int src_begin = NS_MAX(0, FloorInt(src_pixel - src_support));
+    int src_end = NS_MIN(src_size - 1, CeilInt(src_pixel + src_support));
+
+    // Compute the unnormalized filter value at each location of the source
+    // it covers.
+    float filter_sum = 0.0f;  // Sum of the filter values for normalizing.
+    for (int cur_filter_pixel = src_begin; cur_filter_pixel <= src_end;
+         cur_filter_pixel++) {
+      // Distance from the center of the filter, this is the filter coordinate
+      // in source space. We also need to consider the center of the pixel
+      // when comparing distance against 'src_pixel'. In the 5x downscale
+      // example used above the distance from the center of the filter to
+      // the pixel with coordinates (2, 2) should be 0, because its center
+      // is at (2.5, 2.5).
+      // TODO(evannier): as above (in regards to the 0.5 pixel error),
+      // this code is incorrect, but is left as-is for the same reasons.
+      // float src_filter_dist =
+      //     ((static_cast<float>(cur_filter_pixel) + 0.5f) - src_pixel);
+      float src_filter_dist = cur_filter_pixel - src_pixel;
+
+      // Since the filter really exists in dest space, map it there.
+      float dest_filter_dist = src_filter_dist * clamped_scale;
+
+      // Compute the filter value at that location.
+      float filter_value = ComputeFilter(dest_filter_dist);
+      filter_values->push_back(filter_value);
+
+      filter_sum += filter_value;
+    }
+
+    // The filter must be normalized so that we don't affect the brightness of
+    // the image. Convert to normalized fixed point.
+    int16_t fixed_sum = 0;
+    for (size_t i = 0; i < filter_values->size(); i++) {
+      int16_t cur_fixed = output->FloatToFixed(filter_values[i] / filter_sum);
+      fixed_sum += cur_fixed;
+      fixed_filter_values->push_back(cur_fixed);
+    }
+
+    // The conversion to fixed point will leave some rounding errors, which
+    // we add back in to avoid affecting the brightness of the image. We
+    // arbitrarily add this to the center of the filter array (this won't always
+    // be the center of the filter function since it could get clipped on the
+    // edges, but it doesn't matter enough to worry about that case).
+    int16_t leftovers = output->FloatToFixed(1.0f) - fixed_sum;
+    fixed_filter_values[fixed_filter_values->size() / 2] += leftovers;
+
+    // Now it's ready to go.
+    output->AddFilter(src_begin, &fixed_filter_values[0],
+                      static_cast<int>(fixed_filter_values->size()));
+  }
+
+  output->PaddingForSIMD(8);
+}
+
+ImageOperations::ResizeMethod ResizeMethodToAlgorithmMethod(
+    ImageOperations::ResizeMethod method) {
+  // Convert any "Quality Method" into an "Algorithm Method"
+  if (method >= ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD &&
+      method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD) {
+    return method;
+  }
+  // No GPU-accelerated resize is available here, so we just pick the
+  // appropriate software method for each resize quality.
+  switch (method) {
+    // Users of RESIZE_GOOD are willing to trade a lot of quality to
+    // get speed, allowing the use of linear resampling to get hardware
+    // acceleration (SRB). Hence any of our "good" software filters
+    // will be acceptable, and we use the fastest one, Hamming-1.
+    case ImageOperations::RESIZE_GOOD:
+      // Users of RESIZE_BETTER are willing to trade some quality in order
+      // to improve performance, but are guaranteed not to devolve to a linear
+      // resampling. In visual tests we see that Hamming-1 is not as good as
+      // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is
+      // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed
+      // an acceptable trade-off between quality and speed.
+    case ImageOperations::RESIZE_BETTER:
+      return ImageOperations::RESIZE_HAMMING1;
+    default:
+      return ImageOperations::RESIZE_LANCZOS3;
+  }
+}
+
+}  // namespace
+
+// Resize ----------------------------------------------------------------------
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset,
+                                 void* dest_pixels /* = nullptr */) {
+  if (method == ImageOperations::RESIZE_SUBPIXEL)
+    return ResizeSubpixel(source, dest_width, dest_height, dest_subset);
+  else
+    return ResizeBasic(source, method, dest_width, dest_height, dest_subset,
+                       dest_pixels);
+}
+
+// static
+SkBitmap ImageOperations::ResizeSubpixel(const SkBitmap& source,
+                                         int dest_width, int dest_height,
+                                         const SkIRect& dest_subset) {
+  // Currently only works on Linux/BSD because these are the only platforms
+  // where SkFontHost::GetSubpixelOrder is defined.
+#if defined(XP_UNIX)
+  // Understand the display.
+  const SkFontHost::LCDOrder order = SkFontHost::GetSubpixelOrder();
+  const SkFontHost::LCDOrientation orientation =
+      SkFontHost::GetSubpixelOrientation();
+
+  // Decide on which dimension, if any, to deploy subpixel rendering.
+  int w = 1;
+  int h = 1;
+  switch (orientation) {
+    case SkFontHost::kHorizontal_LCDOrientation:
+      w = dest_width < source.width() ? 3 : 1;
+      break;
+    case SkFontHost::kVertical_LCDOrientation:
+      h = dest_height < source.height() ? 3 : 1;
+      break;
+  }
+
+  // Resize the image.
+  const int width = dest_width * w;
+  const int height = dest_height * h;
+  SkIRect subset = { dest_subset.fLeft, dest_subset.fTop,
+                     dest_subset.fLeft + dest_subset.width() * w,
+                     dest_subset.fTop + dest_subset.height() * h };
+  SkBitmap img = ResizeBasic(source, ImageOperations::RESIZE_LANCZOS3, width,
+                             height, subset);
+  const int row_words = img.rowBytes() / 4;
+  if (w == 1 && h == 1)
+    return img;
+
+  // Render into subpixels.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config, dest_subset.width(),
+                   dest_subset.height());
+  result.allocPixels();
+  if (!result.readyToDraw())
+    return img;
+
+  SkAutoLockPixels locker(img);
+  if (!img.readyToDraw())
+    return img;
+
+  uint32_t* src_row = img.getAddr32(0, 0);
+  uint32_t* dst_row = result.getAddr32(0, 0);
+  for (int y = 0; y < dest_subset.height(); y++) {
+    uint32_t* src = src_row;
+    uint32_t* dst = dst_row;
+    for (int x = 0; x < dest_subset.width(); x++, src += w, dst++) {
+      uint8_t r = 0, g = 0, b = 0, a = 0;
+      switch (order) {
+        case SkFontHost::kRGB_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              r = SkGetPackedR32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              b = SkGetPackedB32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              r = SkGetPackedR32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              b = SkGetPackedB32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kBGR_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              b = SkGetPackedB32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              r = SkGetPackedR32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              b = SkGetPackedB32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              r = SkGetPackedR32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kNONE_LCDOrder:
+          break;
+      }
+      // Premultiplied alpha is very fragile.
+      a = a > r ? a : r;
+      a = a > g ? a : g;
+      a = a > b ? a : b;
+      *dst = SkPackARGB32(a, r, g, b);
+    }
+    src_row += h * row_words;
+    dst_row += result.rowBytes() / 4;
+  }
+  result.setIsOpaque(img.isOpaque());
+  return result;
+#else
+  return SkBitmap();
+#endif  // XP_UNIX
+}
+
+// static
+SkBitmap ImageOperations::ResizeBasic(const SkBitmap& source,
+                                      ResizeMethod method,
+                                      int dest_width, int dest_height,
+                                      const SkIRect& dest_subset,
+                                      void* dest_pixels /* = nullptr */) {
+  // Ensure that the ResizeMethod enumeration is sound.
+  SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
+            (method <= RESIZE_LAST_QUALITY_METHOD)) ||
+           ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+            (method <= RESIZE_LAST_ALGORITHM_METHOD)));
+
+  // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
+  // return empty.
+  if (source.width() < 1 || source.height() < 1 ||
+      dest_width < 1 || dest_height < 1)
+    return SkBitmap();
+
+  method = ResizeMethodToAlgorithmMethod(method);
+  // Check that we only deal with an "algorithm method" from this point onward.
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  SkAutoLockPixels locker(source);
+  if (!source.readyToDraw())
+    return SkBitmap();
+
+  ResizeFilter filter(method, source.width(), source.height(),
+                      dest_width, dest_height, dest_subset);
+
+  // Get a pointer to the source pixels. The filters already encode which
+  // source pixels each output pixel depends on, so the convolver can work
+  // directly on the full source bitmap.
+  const uint8_t* source_subset =
+      reinterpret_cast<const uint8_t*>(source.getPixels());
+
+  // Convolve into the result.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config,
+                   dest_subset.width(), dest_subset.height());
+
+  if (dest_pixels) {
+    result.setPixels(dest_pixels);
+  } else {
+    result.allocPixels();
+  }
+
+  if (!result.readyToDraw())
+    return SkBitmap();
+
+  BGRAConvolve2D(source_subset, static_cast<int>(source.rowBytes()),
+                 !source.isOpaque(), filter.x_filter(), filter.y_filter(),
+                 static_cast<int>(result.rowBytes()),
+                 static_cast<unsigned char*>(result.getPixels()),
+                 /* use_sse2 = */ false);
+
+  // Preserve the "opaque" flag for use as an optimization later.
+  result.setIsOpaque(source.isOpaque());
+
+  return result;
+}
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 void* dest_pixels /* = nullptr */) {
+  SkIRect dest_subset = { 0, 0, dest_width, dest_height };
+  return Resize(source, method, dest_width, dest_height, dest_subset,
+                dest_pixels);
+}
+
+}  // namespace skia
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_IMAGE_OPERATIONS_H_
+#define SKIA_EXT_IMAGE_OPERATIONS_H_
+
+#include "skia/SkTypes.h"
+#include "Types.h"
+
+class SkBitmap;
+struct SkIRect;
+
+namespace skia {
+
+class ImageOperations {
+ public:
+  enum ResizeMethod {
+    //
+    // Quality Methods
+    //
+    // Those enumeration values express a desired quality/speed tradeoff.
+    // They are translated into an algorithm-specific method that depends
+    // on the capabilities (CPU, GPU) of the underlying platform.
+    // It is possible for all three methods to be mapped to the same
+    // algorithm on a given platform.
+
+    // Good quality resizing. Fastest resizing with acceptable visual quality.
+    // This is typically intended for use during interactive layouts
+    // where slower platforms may want to trade image quality for a large
+    // increase in resizing performance.
+    //
+    // For example the resizing implementation may devolve to linear
+    // filtering if this enables GPU acceleration to be used.
+    //
+    // Note that the underlying resizing method may be determined
+    // on the fly based on the parameters for a given resize call.
+    // For example an implementation using a GPU-based linear filter
+    // in the common case may still use a higher-quality software-based
+    // filter in cases where using the GPU would actually be slower - due
+    // to too much latency - or impossible - due to image format or size
+    // constraints.
+    RESIZE_GOOD,
+
+    // Medium quality resizing. Close to high quality resizing (better
+    // than linear interpolation) with potentially some quality being
+    // traded-off for additional speed compared to RESIZE_BEST.
+    //
+    // This is intended, for example, for generation of large thumbnails
+    // (hundreds of pixels in each dimension) from large sources, where
+    // a linear filter would produce too many artifacts but where
+    // a RESIZE_HIGH might be too costly time-wise.
+    RESIZE_BETTER,
+
+    // High quality resizing. The algorithm is picked to favor image quality.
+    RESIZE_BEST,
+
+    //
+    // Algorithm-specific enumerations
+    //
+
+    // Box filter. This is a weighted average of all of the pixels touching
+    // the destination pixel. For enlargement, this is nearest neighbor.
+    //
+    // You probably don't want this, it is here for testing since it is easy to
+    // compute. Use RESIZE_LANCZOS3 instead.
+    RESIZE_BOX,
+
+    // 1-cycle Hamming filter. This is tall in the middle and falls off towards
+    // the window edges but without going to 0. This is about 40% faster than
+    // a 2-cycle Lanczos.
+    RESIZE_HAMMING1,
+
+    // 2-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then returns to zero. Does not provide as good a frequency
+    // response as a 3-cycle Lanczos but is roughly 30% faster.
+    RESIZE_LANCZOS2,
+
+    // 3-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then oscillates 2 more times. It gives nice sharp edges.
+    RESIZE_LANCZOS3,
+
+    // Lanczos filter + subpixel interpolation. If subpixel rendering is not
+    // appropriate we automatically fall back to Lanczos.
+    RESIZE_SUBPIXEL,
+
+    // enum aliases for first and last methods by algorithm or by quality.
+    RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
+    RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
+    RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
+    RESIZE_LAST_ALGORITHM_METHOD = RESIZE_SUBPIXEL,
+  };
+
+  // Resizes the given source bitmap using the specified resize method, so that
+  // the entire image is (dest_width, dest_height) big. The dest_subset is the
+  // rectangle in this destination image that should actually be returned.
+  //
+  // The output image will be (dest_subset.width(), dest_subset.height()). This
+  // will save work if you do not need the entire bitmap.
+  //
+  // The destination subset must lie entirely within the destination image.
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         const SkIRect& dest_subset,
+                         void* dest_pixels = nullptr);
+
+  // Alternate version for resizing and returning the entire bitmap rather than
+  // a subset.
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         void* dest_pixels = nullptr);
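+
+  // A minimal usage sketch (illustrative; |source| is assumed to be a valid
+  // ARGB_8888 SkBitmap):
+  //
+  //   SkBitmap thumbnail = ImageOperations::Resize(
+  //       source, ImageOperations::RESIZE_BEST, 160, 120);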
+
+ private:
+  ImageOperations();  // Class for scoping only.
+
+  // Supports all methods except RESIZE_SUBPIXEL.
+  static SkBitmap ResizeBasic(const SkBitmap& source,
+                              ResizeMethod method,
+                              int dest_width, int dest_height,
+                              const SkIRect& dest_subset,
+                              void* dest_pixels = nullptr);
+
+  // Subpixel renderer.
+  static SkBitmap ResizeSubpixel(const SkBitmap& source,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset);
+};
+
+}  // namespace skia
+
+#endif  // SKIA_EXT_IMAGE_OPERATIONS_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/port.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_PORT_H_
+#define BASE_PORT_H_
+
+#include <stdarg.h>
+#include "build/build_config.h"
+
+#ifdef COMPILER_MSVC
+#define GG_LONGLONG(x) x##I64
+#define GG_ULONGLONG(x) x##UI64
+#else
+#define GG_LONGLONG(x) x##LL
+#define GG_ULONGLONG(x) x##ULL
+#endif
+
+// Per C99 7.18.4, define __STDC_CONSTANT_MACROS before including <stdint.h>
+// to get the INTn_C and UINTn_C macros for integer constants.  It's difficult
+// to guarantee any specific ordering of header includes, so it's difficult to
+// guarantee that the INTn_C macros can be defined by including <stdint.h> at
+// any specific point.  Provide GG_INTn_C macros instead.
+
+#define GG_INT8_C(x)    (x)
+#define GG_INT16_C(x)   (x)
+#define GG_INT32_C(x)   (x)
+#define GG_INT64_C(x)   GG_LONGLONG(x)
+
+#define GG_UINT8_C(x)   (x ## U)
+#define GG_UINT16_C(x)  (x ## U)
+#define GG_UINT32_C(x)  (x ## U)
+#define GG_UINT64_C(x)  GG_ULONGLONG(x)
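+
+// For example (illustrative), GG_INT64_C(0x100000000) expands to
+// 0x100000000I64 under MSVC and to 0x100000000LL elsewhere.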
+
+namespace base {
+
+// It's possible for functions that use a va_list, such as StringPrintf, to
+// invalidate the data in it upon use.  The fix is to make a copy of the
+// structure before using it and use that copy instead.  va_copy is provided
+// for this purpose.  MSVC does not provide va_copy, so define an
+// implementation here.  It is not guaranteed that assignment is a copy, so the
+// StringUtil.VariableArgsFunc unit test tests this capability.
+
+// The C standard says that va_copy is a "macro", not a function. Trying to
+// pass a va_list by reference to a function instead breaks some machines.
+#  if defined(COMPILER_GCC)
+#    define base_va_copy(_a, _b) ::va_copy(_a, _b)
+#  elif defined(COMPILER_MSVC)
+#    define base_va_copy(_a, _b) (_a = _b)
+#  else
+#    error No va_copy for your compiler
+#  endif
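+
+// A minimal usage sketch (illustrative): copy the caller's va_list before
+// consuming it a second time.
+//
+//   void LogTwiceV(const char* fmt, va_list args) {
+//     va_list copy;
+//     base_va_copy(copy, args);
+//     vfprintf(stderr, fmt, args);
+//     vfprintf(stdout, fmt, copy);
+//     va_end(copy);
+//   }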
+
+}  // namespace base
+
+// Define an OS-neutral wrapper for shared library entry points
+#if defined(OS_WIN)
+#define API_CALL __stdcall
+#elif defined(OS_LINUX) || defined(OS_MACOSX)
+#define API_CALL
+#endif
+
+#endif  // BASE_PORT_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/stack_container.h
@@ -0,0 +1,253 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STACK_CONTAINER_H_
+#define BASE_STACK_CONTAINER_H_
+
+#include <string>
+#include <vector>
+
+#include "basictypes.h"
+
+// This allocator can be used with STL containers to provide a stack buffer
+// from which to allocate memory and overflows onto the heap. This stack buffer
+// would be allocated on the stack and allows us to avoid heap operations in
+// some situations.
+//
+// STL likes to make copies of allocators, so the allocator itself can't hold
+// the data. Instead, we make the creator responsible for creating a
+// StackAllocator::Source which contains the data. Copying the allocator
+// merely copies the pointer to this shared source, so all allocators created
+// based on our allocator will share the same stack buffer.
+//
+// This stack buffer implementation is very simple. The first allocation that
+// fits in the stack buffer will use the stack buffer. Any subsequent
+// allocations will not use the stack buffer, even if there is unused room.
+// This makes it appropriate for array-like containers, but the caller should
+// be sure to reserve() in the container up to the stack buffer size. Otherwise
+// the container will allocate a small array which will "use up" the stack
+// buffer.
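+//
+// A minimal usage sketch (illustrative); in practice, prefer the StackVector /
+// StackString wrappers below, which manage the Source and the reserve() call:
+//
+//   StackAllocator<int, 16>::Source source;
+//   StackAllocator<int, 16> allocator(&source);
+//   std::vector<int, StackAllocator<int, 16> > v(allocator);
+//   v.reserve(16);  // the first allocation lands in the stack buffer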
+template<typename T, size_t stack_capacity>
+class StackAllocator : public std::allocator<T> {
+ public:
+  typedef typename std::allocator<T>::pointer pointer;
+  typedef typename std::allocator<T>::size_type size_type;
+
+  // Backing store for the allocator. The container owner is responsible for
+  // maintaining this for as long as any containers using this allocator are
+  // live.
+  struct Source {
+    Source() : used_stack_buffer_(false) {
+    }
+
+    // Casts the buffer to its proper type.
+    T* stack_buffer() { return reinterpret_cast<T*>(stack_buffer_); }
+    const T* stack_buffer() const {
+      return reinterpret_cast<const T*>(stack_buffer_);
+    }
+
+    //
+    // IMPORTANT: Take care to ensure that stack_buffer_ is aligned
+    // since it is used to mimic an array of T.
+    // Be careful while declaring any unaligned types (like bool)
+    // before stack_buffer_.
+    //
+
+    // The buffer itself. It is not of type T because we don't want the
+    // constructors and destructors to be automatically called. Define a POD
+    // buffer of the right size instead.
+    char stack_buffer_[sizeof(T[stack_capacity])];
+
+    // Set when the stack buffer is used for an allocation. We do not track
+    // how much of the buffer is used, only that somebody is using it.
+    bool used_stack_buffer_;
+  };
+
+  // Used by containers when they want to refer to an allocator of type U.
+  template<typename U>
+  struct rebind {
+    typedef StackAllocator<U, stack_capacity> other;
+  };
+
+  // For the straight up copy c-tor, we can share storage.
+  StackAllocator(const StackAllocator<T, stack_capacity>& rhs)
+      : source_(rhs.source_) {
+  }
+
+  // ISO C++ requires the following constructor to be defined,
+  // and std::vector in VC++2008SP1 Release fails with an error
+  // in the class _Container_base_aux_alloc_real (from <xutility>)
+  // if the constructor does not exist.
+  // For this constructor, we cannot share storage; there's
+  // no guarantee that the Source buffer of Ts is large enough
+  // for Us.
+  // TODO: If we were fancy pants, perhaps we could share storage
+  // iff sizeof(T) == sizeof(U).
+  template<typename U, size_t other_capacity>
+  StackAllocator(const StackAllocator<U, other_capacity>& other)
+      : source_(NULL) {
+  }
+
+  explicit StackAllocator(Source* source) : source_(source) {
+  }
+
+  // Actually do the allocation. Use the stack buffer if nobody has used it yet
+  // and the size requested fits. Otherwise, fall through to the standard
+  // allocator.
+  pointer allocate(size_type n, void* hint = 0) {
+    if (source_ != NULL && !source_->used_stack_buffer_
+        && n <= stack_capacity) {
+      source_->used_stack_buffer_ = true;
+      return source_->stack_buffer();
+    } else {
+      return std::allocator<T>::allocate(n, hint);
+    }
+  }
+
+  // Free: when trying to free the stack buffer, just mark it as free. For
+  // non-stack-buffer pointers, just fall through to the standard allocator.
+  void deallocate(pointer p, size_type n) {
+    if (source_ != NULL && p == source_->stack_buffer())
+      source_->used_stack_buffer_ = false;
+    else
+      std::allocator<T>::deallocate(p, n);
+  }
+
+ private:
+  Source* source_;
+};
+
+// A wrapper around STL containers that maintains a stack-sized buffer that the
+// initial capacity of the vector is based on. Growing the container beyond the
+// stack capacity will transparently overflow onto the heap. The container must
+// support reserve().
+//
+// WATCH OUT: the ContainerType MUST use the proper StackAllocator for this
+// type. This object is really intended to be used only internally. You'll want
+// to use the wrappers below for different types.
+template<typename TContainerType, int stack_capacity>
+class StackContainer {
+ public:
+  typedef TContainerType ContainerType;
+  typedef typename ContainerType::value_type ContainedType;
+  typedef StackAllocator<ContainedType, stack_capacity> Allocator;
+
+  // Allocator must be constructed before the container!
+  StackContainer() : allocator_(&stack_data_), container_(allocator_) {
+    // Make the container use the stack allocation by reserving our buffer size
+    // before doing anything else.
+    container_.reserve(stack_capacity);
+  }
+
+  // Getters for the actual container.
+  //
+  // Danger: any copies of this made using the copy constructor must have
+  // shorter lifetimes than the source. The copy will share the same allocator
+  // and therefore the same stack buffer as the original. Use std::copy to
+  // copy into a "real" container for longer-lived objects.
+  ContainerType& container() { return container_; }
+  const ContainerType& container() const { return container_; }
+
+  // Support operator-> to get to the container. This allows nicer syntax like:
+  //   StackContainer<...> foo;
+  //   std::sort(foo->begin(), foo->end());
+  ContainerType* operator->() { return &container_; }
+  const ContainerType* operator->() const { return &container_; }
+
+#ifdef UNIT_TEST
+  // Retrieves the stack source so that unit tests can verify that the
+  // buffer is being used properly.
+  const typename Allocator::Source& stack_data() const {
+    return stack_data_;
+  }
+#endif
+
+ protected:
+  typename Allocator::Source stack_data_;
+  Allocator allocator_;
+  ContainerType container_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(StackContainer);
+};
+
+// StackString
+template<size_t stack_capacity>
+class StackString : public StackContainer<
+    std::basic_string<char,
+                      std::char_traits<char>,
+                      StackAllocator<char, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackString() : StackContainer<
+      std::basic_string<char,
+                        std::char_traits<char>,
+                        StackAllocator<char, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(StackString);
+};
+
+// StackWString
+template<size_t stack_capacity>
+class StackWString : public StackContainer<
+    std::basic_string<wchar_t,
+                      std::char_traits<wchar_t>,
+                      StackAllocator<wchar_t, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackWString() : StackContainer<
+      std::basic_string<wchar_t,
+                        std::char_traits<wchar_t>,
+                        StackAllocator<wchar_t, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(StackWString);
+};
+
+// StackVector
+//
+// Example:
+//   StackVector<int, 16> foo;
+//   foo->push_back(22);  // we have overloaded operator->
+//   foo[0] = 10;         // as well as operator[]
+template<typename T, size_t stack_capacity>
+class StackVector : public StackContainer<
+    std::vector<T, StackAllocator<T, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackVector() : StackContainer<
+      std::vector<T, StackAllocator<T, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+  // We need to put this in STL containers sometimes, which requires a copy
+  // constructor. We can't call the regular copy constructor because that will
+  // take the stack buffer from the original. Here, we create an empty object
+  // and make a stack buffer of its own.
+  StackVector(const StackVector<T, stack_capacity>& other)
+      : StackContainer<
+            std::vector<T, StackAllocator<T, stack_capacity> >,
+            stack_capacity>() {
+    this->container().assign(other->begin(), other->end());
+  }
+
+  StackVector<T, stack_capacity>& operator=(
+      const StackVector<T, stack_capacity>& other) {
+    this->container().assign(other->begin(), other->end());
+    return *this;
+  }
+
+  // Vectors are commonly indexed, which isn't very convenient even with
+  // operator-> (using "->at()" does exception stuff we don't want).
+  T& operator[](size_t i) { return this->container().operator[](i); }
+  const T& operator[](size_t i) const {
+    return this->container().operator[](i);
+  }
+};
+
+#endif  // BASE_STACK_CONTAINER_H_