Bug 486918. Part 1: Import Chromium's higher-quality image scalers, since we know those to be good and shippable. r=jrmuizel
author      Joe Drew <joe@drew.ca>
date        Thu, 23 Aug 2012 15:36:04 -0400
changeset   108787 a59944cc37813631d5c5d9847691a95bd3c0cdb3
parent      108786 bed067351ab6e0202b1b05b323c70bc59b84590b
child       108788 27e0c22b96e5ef918e0de7817ee572a26d0aa274
push id     82
push user   shu@rfrn.org
push date   Fri, 05 Oct 2012 13:20:22 +0000
reviewers   jrmuizel
bugs        486918
milestone   18.0a1
gfx/2d/HelpersSkia.h
gfx/2d/Makefile.in
gfx/2d/Scale.cpp
gfx/2d/Scale.h
gfx/2d/basictypes.h
gfx/2d/convolver.cpp
gfx/2d/convolver.h
gfx/2d/cpu.h
gfx/2d/image_operations.cpp
gfx/2d/image_operations.h
gfx/2d/port.h
gfx/2d/stack_container.h
--- a/gfx/2d/HelpersSkia.h
+++ b/gfx/2d/HelpersSkia.h
@@ -5,16 +5,17 @@
 
 #ifndef MOZILLA_GFX_HELPERSSKIA_H_
 #define MOZILLA_GFX_HELPERSSKIA_H_
 
 #include "2D.h"
 #include "skia/SkCanvas.h"
 #include "skia/SkDashPathEffect.h"
 #include "mozilla/Assertions.h"
+#include <vector>
 
 namespace mozilla {
 namespace gfx {
 
 static inline SkBitmap::Config
 GfxFormatToSkiaConfig(SurfaceFormat format)
 {
   switch (format)
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@@ -24,16 +24,17 @@ EXPORTS_mozilla/gfx	= \
         BaseMargin.h \
         BaseRect.h \
         BaseSize.h \
         Blur.h \
         PathHelpers.h \
         Point.h \
         Matrix.h \
         Rect.h \
+        Scale.h \
         Types.h \
         Tools.h \
         UserData.h \
 	$(NULL)
 
 CPPSRCS	= \
         Factory.cpp \
         Rect.cpp \
@@ -41,16 +42,17 @@ CPPSRCS	= \
         DrawTargetCairo.cpp \
         SourceSurfaceCairo.cpp \
         PathCairo.cpp \
         DrawTargetRecording.cpp \
         PathRecording.cpp \
         RecordedEvent.cpp \
         DrawEventRecorder.cpp \
         Blur.cpp \
+        Scale.cpp \
         ScaledFontBase.cpp \
         DrawTargetDual.cpp \
         ImageScaling.cpp \
         SourceSurfaceRawData.cpp \
         $(NULL)
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 CPPSRCS	+= \
@@ -71,16 +73,18 @@ endif
 
 DEFINES += -DMOZ_GFX -DUSE_CAIRO -DGFX2D_INTERNAL
 
 ifdef MOZ_ENABLE_SKIA
 CPPSRCS	+= \
         SourceSurfaceSkia.cpp \
         DrawTargetSkia.cpp \
         PathSkia.cpp \
+        convolver.cpp \
+        image_operations.cpp \
         $(NULL)
 
 DEFINES += -DUSE_SKIA
 
 endif
 
 ifeq (cocoa,$(MOZ_WIDGET_TOOLKIT))
 ifdef MOZ_ENABLE_SKIA
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.cpp
@@ -0,0 +1,54 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Scale.h"
+
+#ifdef USE_SKIA
+#include "HelpersSkia.h"
+#include "skia/SkBitmap.h"
+#include "image_operations.h"
+#endif
+
+namespace mozilla {
+namespace gfx {
+
+bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+           uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+           SurfaceFormat format)
+{
+#ifdef USE_SKIA
+  bool opaque;
+  if (format == FORMAT_B8G8R8A8) {
+    opaque = false;
+  } else {
+    opaque = true;
+  }
+
+  SkBitmap::Config config = GfxFormatToSkiaConfig(format);
+
+  SkBitmap imgSrc;
+  imgSrc.setConfig(config, srcWidth, srcHeight, srcStride);
+  imgSrc.setPixels(srcData);
+  imgSrc.setIsOpaque(opaque);
+
+  // Rescaler is compatible with 32 bpp only. Convert to RGB32 if needed.
+  if (config != SkBitmap::kARGB_8888_Config) {
+    imgSrc.copyTo(&imgSrc, SkBitmap::kARGB_8888_Config);
+  }
+
+  // This returns an SkBitmap backed by dstData; it writes directly into
+  // dstData, so we only use the returned SkBitmap to check for success.
+  SkBitmap result = skia::ImageOperations::Resize(imgSrc,
+                                                  skia::ImageOperations::RESIZE_BEST,
+                                                  dstWidth, dstHeight,
+                                                  dstData);
+
+  return result.readyToDraw();
+#else
+  return false;
+#endif
+}
+
+}
+}
new file mode 100644
--- /dev/null
+++ b/gfx/2d/Scale.h
@@ -0,0 +1,36 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_SCALE_H_
+#define MOZILLA_GFX_SCALE_H_
+
+#include "Types.h"
+
+namespace mozilla {
+namespace gfx {
+
+/**
+ * Scale an image using a high-quality filter.
+ *
+ * Synchronously scales an image and writes the output to the destination in
+ * 32-bit format. The destination must be pre-allocated by the caller.
+ *
+ * Returns true if scaling was successful, and false otherwise. Currently, this
+ * function is implemented using Skia. If Skia is not enabled when building,
+ * calling this function will always return false.
+ *
+ * IMPLEMENTATION NOTES:
+ * This API is not currently easily hardware acceleratable. A better API might
+ * take a SourceSurface and return a SourceSurface; the Direct2D backend, for
+ * example, could simply set a status bit on a copy of the image, and use
+ * Direct2D's high-quality scaler at draw time.
+ */
+GFX2D_API bool Scale(uint8_t* srcData, int32_t srcWidth, int32_t srcHeight, int32_t srcStride,
+                     uint8_t* dstData, int32_t dstWidth, int32_t dstHeight, int32_t dstStride,
+                     SurfaceFormat format);
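+
+/* A usage sketch (illustrative only; the std::vector destination buffer and
+ * the FORMAT_B8G8R8A8 value below are assumptions for the example, not part
+ * of this API):
+ *
+ *   std::vector<uint8_t> dst(dstWidth * dstHeight * 4);
+ *   bool ok = Scale(srcData, srcWidth, srcHeight, srcWidth * 4,
+ *                   &dst[0], dstWidth, dstHeight, dstWidth * 4,
+ *                   FORMAT_B8G8R8A8);
+ *   // ok is false when the build has no Skia support.
+ */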
+
+}
+}
+
+#endif /* MOZILLA_GFX_SCALE_H_ */
new file mode 100644
--- /dev/null
+++ b/gfx/2d/basictypes.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_BASICTYPES_H_
+#define BASE_BASICTYPES_H_
+
+// Chromium includes a prtypes.h also, but it has been modified to include
+// their build_config.h as well. We can therefore test for both to determine
+// if someone screws up the include order.
+#if defined(prtypes_h___) && !defined(BUILD_BUILD_CONFIG_H_)
+#error You_must_include_basictypes.h_before_prtypes.h!
+#endif
+
+#ifndef NO_NSPR_10_SUPPORT
+#define NO_NSPR_10_SUPPORT
+#define NO_NSPR_10_SUPPORT_SAVE
+#endif
+
+
+#ifdef NO_NSPR_10_SUPPORT_SAVE
+#undef NO_NSPR_10_SUPPORT_SAVE
+#undef NO_NSPR_10_SUPPORT
+#endif
+
+#ifdef _WIN32
+#undef _WIN32
+#define _WIN32_SAVE
+#endif
+
+
+#ifdef _WIN32_SAVE
+#undef _WIN32_SAVE
+#define _WIN32
+#endif
+
+#include <limits.h>         // So we can set the bounds of our types
+#include <stddef.h>         // For size_t
+#include <string.h>         // for memcpy
+
+//#include "base/port.h"    // Types that only need exist on certain systems
+
+#ifndef COMPILER_MSVC
+// stdint.h is part of C99 but MSVC doesn't have it.
+#include <stdint.h>         // For intptr_t.
+#endif
+typedef uint8_t uint8;
+typedef int16_t int16;
+#if 0
+// A type to represent a Unicode code-point value. As of Unicode 4.0,
+// such values require up to 21 bits.
+// (For type-checking on pointers, make this explicitly signed,
+// and it should always be the signed version of whatever int32 is.)
+typedef signed int         char32;
+
+const uint8  kuint8max  = (( uint8) 0xFF);
+const uint16 kuint16max = ((uint16) 0xFFFF);
+const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
+const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
+const  int8  kint8min   = ((  int8) 0x80);
+const  int8  kint8max   = ((  int8) 0x7F);
+const  int16 kint16min  = (( int16) 0x8000);
+const  int16 kint16max  = (( int16) 0x7FFF);
+const  int32 kint32min  = (( int32) 0x80000000);
+const  int32 kint32max  = (( int32) 0x7FFFFFFF);
+const  int64 kint64min  = (( int64) GG_LONGLONG(0x8000000000000000));
+const  int64 kint64max  = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF));
+#endif
+// Platform- and hardware-dependent printf specifiers
+#  if defined(OS_POSIX)
+#    define __STDC_FORMAT_MACROS 1
+#    include <inttypes.h>           // for 64-bit integer format macros
+#    define PRId64L "I64d"
+#    define PRIu64L "I64u"
+#    define PRIx64L "I64x"
+#  elif defined(OS_WIN)
+#    define PRId64 "I64d"
+#    define PRIu64 "I64u"
+#    define PRIx64 "I64x"
+#    define PRId64L L"I64d"
+#    define PRIu64L L"I64u"
+#    define PRIx64L L"I64x"
+#  endif
+
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);               \
+  void operator=(const TypeName&)
+
+// An older, deprecated, politically incorrect name for the above.
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// A macro to disallow all the implicit constructors, namely the
+// default constructor, copy constructor and operator= functions.
+//
+// This should be used in the private: declarations for a class
+// that wants to prevent anyone from instantiating it. This is
+// especially useful for classes containing only static methods.
+#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
+  TypeName();                                    \
+  DISALLOW_COPY_AND_ASSIGN(TypeName)
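+
+// For illustration only (this class is hypothetical, not part of this patch):
+//
+//   class MathUtils {
+//    public:
+//     static int Clamp(int value, int low, int high);
+//    private:
+//     DISALLOW_IMPLICIT_CONSTRUCTORS(MathUtils);
+//   };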
+
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+// One caveat is that arraysize() doesn't accept any array of an
+// anonymous type or a type defined inside a function.  In these rare
+// cases, you have to use the unsafe ARRAYSIZE_UNSAFE() macro below.  This is
+// due to a limitation in C++'s template system.  The limitation might
+// eventually be removed, but it hasn't happened yet.
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef _MSC_VER
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
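+
+// For example (values chosen purely for illustration):
+//
+//   static const int kWeights[] = {1, 2, 1};
+//   int scratch[arraysize(kWeights)];  // arraysize(kWeights) == 3
+//
+// whereas calling arraysize() on an int* fails to compile.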
+
+// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize,
+// but can be used on anonymous types or types defined inside
+// functions.  It's less safe than arraysize as it accepts some
+// (although not all) pointers.  Therefore, you should use arraysize
+// whenever possible.
+//
+// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type
+// size_t.
+//
+// ARRAYSIZE_UNSAFE catches a few type errors.  If you see a compiler error
+//
+//   "warning: division by zero in ..."
+//
+// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer.
+// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays.
+//
+// The following comments are on the implementation details, and can
+// be ignored by the users.
+//
+// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in
+// the array) and sizeof(*(arr)) (the # of bytes in one array
+// element).  If the former is divisible by the latter, perhaps arr is
+// indeed an array, in which case the division result is the # of
+// elements in the array.  Otherwise, arr cannot possibly be an array,
+// and we generate a compiler error to prevent the code from
+// compiling.
+//
+// Since the size of bool is implementation-defined, we need to cast
+// !(sizeof(a) & sizeof(*(a))) to size_t in order to ensure the final
+// result has type size_t.
+//
+// This macro is not perfect as it wrongfully accepts certain
+// pointers, namely where the pointer size is divisible by the pointee
+// size.  Since all our code has to go through a 32-bit compiler,
+// where a pointer is 4 bytes, this means all pointers to a type whose
+// size is 3 or greater than 4 will be (righteously) rejected.
+
+#define ARRAYSIZE_UNSAFE(a) \
+  ((sizeof(a) / sizeof(*(a))) / \
+   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+
+
+// Use implicit_cast as a safe version of static_cast or const_cast
+// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
+// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
+// a const pointer to Foo).
+// When you use implicit_cast, the compiler checks that the cast is safe.
+// Such explicit implicit_casts are necessary in surprisingly many
+// situations where C++ demands an exact type match instead of an
+// argument type convertable to a target type.
+//
+// The From type can be inferred, so the preferred syntax for using
+// implicit_cast is the same as for static_cast etc.:
+//
+//   implicit_cast<ToType>(expr)
+//
+// implicit_cast would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+template<typename To, typename From>
+inline To implicit_cast(From const &f) {
+  return f;
+}
+
+// The COMPILE_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   COMPILE_ASSERT(ARRAYSIZE_UNSAFE(content_type_names) == CONTENT_NUM_TYPES,
+//                  content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {
+};
+
+#undef COMPILE_ASSERT
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// Implementation details of COMPILE_ASSERT:
+//
+// - COMPILE_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
+//                               // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     COMPILE_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+
+// MetatagId refers to the metatag-id that we assign to
+// each metatag <name, value> pair.
+//typedef uint32 MetatagId;
+
+// Argument type used in interfaces that can optionally take ownership
+// of a passed in argument.  If TAKE_OWNERSHIP is passed, the called
+// object takes ownership of the argument.  Otherwise it does not.
+enum Ownership {
+  DO_NOT_TAKE_OWNERSHIP,
+  TAKE_OWNERSHIP
+};
+
+// bit_cast<Dest,Source> is a template function that implements the
+// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
+// very low-level functions like the protobuf library and fast math
+// support.
+//
+//   float f = 3.14159265358979;
+//   int i = bit_cast<int32>(f);
+//   // i = 0x40490fdb
+//
+// The classical address-casting method is:
+//
+//   // WRONG
+//   float f = 3.14159265358979;            // WRONG
+//   int i = * reinterpret_cast<int*>(&f);  // WRONG
+//
+// The address-casting method actually produces undefined behavior
+// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
+// section says: if an object in memory has one type, and a program
+// accesses it with a different type, then the result is undefined
+// behavior for most values of "different type".
+//
+// This is true for any cast syntax, either *(int*)&f or
+// *reinterpret_cast<int*>(&f).  And it is particularly true for
+// conversions between integral lvalues and floating-point lvalues.
+//
+// The purpose of 3.10 -15- is to allow optimizing compilers to assume
+// that expressions with different types refer to different memory.  gcc
+// 4.0.1 has an optimizer that takes advantage of this.  So a
+// non-conforming program quietly produces wildly incorrect output.
+//
+// The problem is not the use of reinterpret_cast.  The problem is type
+// punning: holding an object in memory of one type and reading its bits
+// back using a different type.
+//
+// The C++ standard is more subtle and complex than this, but that
+// is the basic idea.
+//
+// Anyways ...
+//
+// bit_cast<> calls memcpy() which is blessed by the standard,
+// especially by the example in section 3.9 .  Also, of course,
+// bit_cast<> wraps up the nasty logic in one place.
+//
+// Fortunately memcpy() is very fast.  In optimized mode, with a
+// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
+// code with the minimal amount of data movement.  On a 32-bit system,
+// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
+// compiles to two loads and two stores.
+//
+// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
+//
+// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
+// is likely to surprise you.
+
+template <class Dest, class Source>
+inline Dest bit_cast(const Source& source) {
+  // Compile time assertion: sizeof(Dest) == sizeof(Source)
+  // A compile error here means your Dest and Source have different sizes.
+  typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];
+
+  Dest dest;
+  memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+// The following enum should be used only as a constructor argument to indicate
+// that the variable has static storage class, and that the constructor should
+// do nothing to its state.  It indicates to the reader that it is legal to
+// declare a static instance of the class, provided the constructor is given
+// the base::LINKER_INITIALIZED argument.  Normally, it is unsafe to declare a
+// static variable that has a constructor or a destructor because invocation
+// order is undefined.  However, IF the type can be initialized by filling with
+// zeroes (which the loader does for static variables), AND the destructor also
+// does nothing to the storage, AND there are no virtual methods, then a
+// constructor declared as
+//       explicit MyClass(base::LinkerInitialized x) {}
+// and invoked as
+//       static MyClass my_variable_name(base::LINKER_INITIALIZED);
+namespace base {
+enum LinkerInitialized { LINKER_INITIALIZED };
+}  // base
+
+
+
+
+#endif  // BASE_BASICTYPES_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.cpp
@@ -0,0 +1,864 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "convolver.h"
+
+#include <algorithm>
+#include "nsAlgorithm.h"
+
+#include "skia/SkTypes.h"
+
+// note: SIMD_SSE2 is not enabled because of bugs, apparently
+
+#if defined(SIMD_SSE2)
+#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
+#endif
+
+namespace skia {
+
+namespace {
+
+// Converts the argument to an 8-bit unsigned value by clamping to the range
+// 0-255.
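+// For example, ClampTo8(-3) yields 0, ClampTo8(130) yields 130, and
+// ClampTo8(300) yields 255 (values chosen purely for illustration).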
+inline unsigned char ClampTo8(int a) {
+  if (static_cast<unsigned>(a) < 256)
+    return a;  // Avoid the extra check in the common case.
+  if (a < 0)
+    return 0;
+  return 255;
+}
+
+// Stores a list of rows in a circular buffer. The usage is you write into it
+// by calling AdvanceRow. It will keep track of which row in the buffer it
+// should use next, and the total number of rows added.
+class CircularRowBuffer {
+ public:
+  // The number of pixels in each row is given in |dest_row_pixel_width|.
+  // The maximum number of rows needed in the buffer is |max_y_filter_size|
+  // (we only need to store enough rows for the biggest filter).
+  //
+  // We use the |first_input_row| to compute the coordinates of all of the
+  // following rows returned by AdvanceRow().
+  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
+                    int first_input_row)
+      : row_byte_width_(dest_row_pixel_width * 4),
+        num_rows_(max_y_filter_size),
+        next_row_(0),
+        next_row_coordinate_(first_input_row) {
+    buffer_.resize(row_byte_width_ * max_y_filter_size);
+    row_addresses_.resize(num_rows_);
+  }
+
+  // Moves to the next row in the buffer, returning a pointer to the beginning
+  // of it.
+  unsigned char* AdvanceRow() {
+    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
+    next_row_coordinate_++;
+
+    // Set the pointer to the next row to use, wrapping around if necessary.
+    next_row_++;
+    if (next_row_ == num_rows_)
+      next_row_ = 0;
+    return row;
+  }
+
+  // Returns a pointer to an "unrolled" array of rows. These rows will start
+  // at the y coordinate placed into |*first_row_index| and will continue in
+  // order for the maximum number of rows in this circular buffer.
+  //
+  // The |first_row_index_| may be negative. This means the circular buffer
+  // starts before the top of the image (it hasn't been filled yet).
+  unsigned char* const* GetRowAddresses(int* first_row_index) {
+    // Example for a 4-element circular buffer holding coords 6-9.
+    //   Row 0   Coord 8
+    //   Row 1   Coord 9
+    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
+    //   Row 3   Coord 7
+    //
+    // The "next" row is also the first (lowest) coordinate. This computation
+    // may yield a negative value, but that's OK, the math will work out
+    // since the user of this buffer will compute the offset relative
+    // to the first_row_index and the negative rows will never be used.
+    *first_row_index = next_row_coordinate_ - num_rows_;
+
+    int cur_row = next_row_;
+    for (int i = 0; i < num_rows_; i++) {
+      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
+
+      // Advance to the next row, wrapping if necessary.
+      cur_row++;
+      if (cur_row == num_rows_)
+        cur_row = 0;
+    }
+    return &row_addresses_[0];
+  }
+
+ private:
+  // The buffer storing the rows. They are packed, each one row_byte_width_.
+  std::vector<unsigned char> buffer_;
+
+  // Number of bytes per row in the |buffer_|.
+  int row_byte_width_;
+
+  // The number of rows available in the buffer.
+  int num_rows_;
+
+  // The next row index we should write into. This wraps around as the
+  // circular buffer is used.
+  int next_row_;
+
+  // The y coordinate of the |next_row_|. This is incremented each time a
+  // new row is appended and does not wrap.
+  int next_row_coordinate_;
+
+  // Buffer used by GetRowAddresses().
+  std::vector<unsigned char*> row_addresses_;
+};
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+template<bool has_alpha>
+void ConvolveHorizontally(const unsigned char* src_data,
+                          const ConvolutionFilter1D& filter,
+                          unsigned char* out_row) {
+  // Loop over each pixel on this row in the output image.
+  int num_values = filter.num_values();
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    // Get the filter that determines the current output pixel.
+    int filter_offset, filter_length;
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const unsigned char* row_to_filter = &src_data[filter_offset * 4];
+
+    // Apply the filter to the row to get the destination pixel in |accum|.
+    int accum[4] = {0};
+    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
+      accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
+      accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
+      accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of fractional part.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
+    out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
+    out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
+    if (has_alpha)
+      out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
+  }
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
+                        int filter_length,
+                        unsigned char* const* source_data_rows,
+                        int pixel_width,
+                        unsigned char* out_row) {
+  // We go through each column in the output and do a vertical convolution,
+  // generating one output pixel each time.
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
+    // Compute the number of bytes over in each row that the current column
+    // we're convolving starts at. The pixel will cover the next 4 bytes.
+    int byte_offset = out_x * 4;
+
+    // Apply the filter to one column of pixels.
+    int accum[4] = {0};
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
+      accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
+      accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
+      accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
+      if (has_alpha)
+        accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of precision.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[byte_offset + 0] = ClampTo8(accum[0]);
+    out_row[byte_offset + 1] = ClampTo8(accum[1]);
+    out_row[byte_offset + 2] = ClampTo8(accum[2]);
+    if (has_alpha) {
+      unsigned char alpha = ClampTo8(accum[3]);
+
+      // Make sure the alpha channel doesn't come out smaller than any of the
+      // color channels. We use premultiplied alpha channels, so this should
+      // never happen, but rounding errors will cause this from time to time.
+      // These "impossible" colors will cause overflows (and hence random pixel
+      // values) when the resulting bitmap is drawn to the screen.
+      //
+      // We only need to do this when generating the final output row (here).
+      int max_color_channel = NS_MAX(out_row[byte_offset + 0],
+          NS_MAX(out_row[byte_offset + 1], out_row[byte_offset + 2]));
+      if (alpha < max_color_channel)
+        out_row[byte_offset + 3] = max_color_channel;
+      else
+        out_row[byte_offset + 3] = alpha;
+    } else {
+      // No alpha channel, the image is opaque.
+      out_row[byte_offset + 3] = 0xff;
+    }
+  }
+}
+
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+void ConvolveHorizontally_SSE2(const unsigned char* src_data,
+                               const ConvolutionFilter1D& filter,
+                               unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    __m128i accum = _mm_setzero_si128();
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const __m128i* row_to_filter =
+        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+      __m128i coeff, coeff16;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Load four pixels => unpack the first two pixels to 16 bits =>
+      // multiply with coefficients => accumulate the convolution result.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Duplicate 3rd and 4th coefficients for all channels =>
+      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+      // => accumulate the convolution results.
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      // [16] a3 g3 b3 r3 a2 g2 b2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Advance the pixel and coefficients pointers.
+      row_to_filter += 1;
+      filter_values += 4;
+    }
+
+    // When |filter_length| is not divisible by 4, we need to zero out the
+    // extra filter coefficients that were loaded; other than that the
+    // algorithm is the same as above, except that the 4th pixel will always
+    // be absent.
+    int r = filter_length&3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff, coeff16;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Note: line buffer must be padded to align_up(filter_offset, 16).
+      // We resolve this by using the C version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    accum = _mm_packs_epi32(accum, zero);
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    accum = _mm_packus_epi16(accum, zero);
+
+    // Store the pixel value of 32 bits.
+    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+    out_row += 4;
+  }
+#endif
+}
+
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
+                                const ConvolutionFilter1D& filter,
+                                unsigned char* out_row[4]) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // four pixels in a column per iteration.
+    __m128i accum0 = _mm_setzero_si128();
+    __m128i accum1 = _mm_setzero_si128();
+    __m128i accum2 = _mm_setzero_si128();
+    __m128i accum3 = _mm_setzero_si128();
+    int start = (filter_offset<<2);
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+      __m128i coeff, coeff16lo, coeff16hi;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                          \
+      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
+      src16 = _mm_unpacklo_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      src16 = _mm_unpackhi_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t)
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+
+      start += 16;
+      filter_values += 4;
+    }
+
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8);
+      __m128i coeff;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+
+      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      /* c1 c1 c1 c1 c0 c0 c0 c0 */
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum0 = _mm_packs_epi32(accum0, zero);
+    accum0 = _mm_packus_epi16(accum0, zero);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_packs_epi32(accum1, zero);
+    accum1 = _mm_packus_epi16(accum1, zero);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_packs_epi32(accum2, zero);
+    accum2 = _mm_packus_epi16(accum2, zero);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_packs_epi32(accum3, zero);
+    accum3 = _mm_packus_epi16(accum3, zero);
+
+    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+    out_row[0] += 4;
+    out_row[1] += 4;
+    out_row[2] += 4;
+    out_row[3] += 4;
+  }
+#endif
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int width = pixel_width & ~3;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i accum0, accum1, accum2, accum3, coeff16;
+  const __m128i* src;
+  // Output four pixels per iteration (16 bytes).
+  for (int out_x = 0; out_x < width; out_x += 4) {
+
+    // Accumulated result for each pixel. 32 bits per RGBA channel.
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    accum3 = _mm_setzero_si128();
+
+    // Convolve with one filter coefficient per iteration.
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+      // Duplicate the filter coefficient 8 times.
+      // [16] cj cj cj cj cj cj cj cj
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+      // Load four pixels (16 bytes) together.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][out_x << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+
+      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+
+      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+      // [32] a3 b3 g3 r3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum3 = _mm_add_epi32(accum3, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, accum3);
+
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+
+    if (has_alpha) {
+      // Compute the max(ri, gi, bi) for each pixel.
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+
+      // Make sure the value of alpha channel is always larger than maximum
+      // value of color channels.
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      // Set value of alpha channels to 0xFF.
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    // Store the convolution result (16 bytes) and advance the pixel pointers.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+    out_row += 16;
+  }
+
+  // When the width of the output is not divisible by 4, we need to store one
+  // pixel (4 bytes) at a time, and the fourth pixel is always absent.
+  if (pixel_width & 3) {
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, zero);
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+    if (has_alpha) {
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    for (int out_x = width; out_x < pixel_width; out_x++) {
+      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+      accum0 = _mm_srli_si128(accum0, 4);
+      out_row += 4;
+    }
+  }
+#endif
+}
+
+}  // namespace
+
+// ConvolutionFilter1D ---------------------------------------------------------
+
+ConvolutionFilter1D::ConvolutionFilter1D()
+    : max_filter_(0) {
+}
+
+ConvolutionFilter1D::~ConvolutionFilter1D() {
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const float* filter_values,
+                                    int filter_length) {
+  SkASSERT(filter_length > 0);
+
+  std::vector<Fixed> fixed_values;
+  fixed_values.reserve(filter_length);
+
+  for (int i = 0; i < filter_length; ++i)
+    fixed_values.push_back(FloatToFixed(filter_values[i]));
+
+  AddFilter(filter_offset, &fixed_values[0], filter_length);
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const Fixed* filter_values,
+                                    int filter_length) {
+  // It is common for leading/trailing filter values to be zeros. In such
+  // cases it is beneficial to only store the central factors.
+  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
+  // a 1080p image this optimization gives a ~10% speed improvement.
+  int first_non_zero = 0;
+  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
+    first_non_zero++;
+
+  if (first_non_zero < filter_length) {
+    // Here we have at least one non-zero factor.
+    int last_non_zero = filter_length - 1;
+    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
+      last_non_zero--;
+
+    filter_offset += first_non_zero;
+    filter_length = last_non_zero + 1 - first_non_zero;
+    SkASSERT(filter_length > 0);
+
+    for (int i = first_non_zero; i <= last_non_zero; i++)
+      filter_values_.push_back(filter_values[i]);
+  } else {
+    // Here all the factors were zeroes.
+    filter_length = 0;
+  }
+
+  FilterInstance instance;
+
+  // We pushed filter_length elements onto filter_values_
+  instance.data_location = (static_cast<int>(filter_values_.size()) -
+                            filter_length);
+  instance.offset = filter_offset;
+  instance.length = filter_length;
+  filters_.push_back(instance);
+
+  max_filter_ = NS_MAX(max_filter_, filter_length);
+}
+
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& filter_x,
+                    const ConvolutionFilter1D& filter_y,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2) {
+#if !defined(SIMD_SSE2)
+  // Even if we have runtime support for SSE2 instructions, the binary was not
+  // built with SSE2 support, so we have to fall back to the C version.
+  use_sse2 = false;
+#endif
+
+  int max_y_filter_size = filter_y.max_filter();
+
+  // The next row in the input that we will generate a horizontally
+  // convolved row for. If the filter doesn't start at the beginning of the
+  // image (this is the case when we are only resizing a subset), then we
+  // don't want to generate any output rows before that. Compute the starting
+  // row for convolution as the first pixel for the first vertical filter.
+  int filter_offset, filter_length;
+  const ConvolutionFilter1D::Fixed* filter_values =
+      filter_y.FilterForValue(0, &filter_offset, &filter_length);
+  int next_x_row = filter_offset;
+
+  // We loop over each row in the input doing a horizontal convolution. This
+  // will result in a horizontally convolved image. We write the results into
+  // a circular buffer of convolved rows and do vertical convolution as rows
+  // are available. This prevents us from having to store the entire
+  // intermediate image and helps cache coherency.
+  // We will need four extra rows so that four horizontal convolutions can be
+  // done simultaneously. We also pad each row in the row buffer to be aligned
+  // up to 16 bytes.
+  // TODO(jiesun): We do not use aligned load from row buffer in vertical
+  // convolution pass yet. Somehow Windows does not like it.
+  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
+  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
+  CircularRowBuffer row_buffer(row_buffer_width,
+                               row_buffer_height,
+                               filter_offset);
+
+  // Loop over every possible output row, processing just enough horizontal
+  // convolutions to run each subsequent vertical convolution.
+  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
+  int num_output_rows = filter_y.num_values();
+
+  // We need to check which is the last line to convolve before we advance 4
+  // lines in one iteration.
+  int last_filter_offset, last_filter_length;
+  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
+                          &last_filter_length);
+
+  for (int out_y = 0; out_y < num_output_rows; out_y++) {
+    filter_values = filter_y.FilterForValue(out_y,
+                                            &filter_offset, &filter_length);
+
+    // Generate output rows until we have enough to run the current filter.
+    if (use_sse2) {
+      while (next_x_row < filter_offset + filter_length) {
+        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
+          const unsigned char* src[4];
+          unsigned char* out_row[4];
+          for (int i = 0; i < 4; ++i) {
+            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
+            out_row[i] = row_buffer.AdvanceRow();
+          }
+          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
+          next_x_row += 4;
+        } else {
+          // For the last row, the SSE2 load may access data beyond the image
+          // area, so we use the C version here.
+          if (next_x_row == last_filter_offset + last_filter_length - 1) {
+            if (source_has_alpha) {
+              ConvolveHorizontally<true>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            } else {
+              ConvolveHorizontally<false>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            }
+          } else {
+            ConvolveHorizontally_SSE2(
+                &source_data[next_x_row * source_byte_row_stride],
+                filter_x, row_buffer.AdvanceRow());
+          }
+          next_x_row++;
+        }
+      }
+    } else {
+      while (next_x_row < filter_offset + filter_length) {
+        if (source_has_alpha) {
+          ConvolveHorizontally<true>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        } else {
+          ConvolveHorizontally<false>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        }
+        next_x_row++;
+      }
+    }
+
+    // Compute where in the output image this row of final data will go.
+    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
+
+    // Get the list of rows that the circular buffer has, in order.
+    int first_row_in_circular_buffer;
+    unsigned char* const* rows_to_convolve =
+        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
+
+    // Now compute the start of the subset of those rows that the filter
+    // needs.
+    unsigned char* const* first_row_for_filter =
+        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
+
+    if (source_has_alpha) {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<true>(filter_values, filter_length,
+                                      first_row_for_filter,
+                                      filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<true>(filter_values, filter_length,
+                                 first_row_for_filter,
+                                 filter_x.num_values(), cur_output_row);
+      }
+    } else {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<false>(filter_values, filter_length,
+                                       first_row_for_filter,
+                                       filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<false>(filter_values, filter_length,
+                                  first_row_for_filter,
+                                  filter_x.num_values(), cur_output_row);
+      }
+    }
+  }
+}
+
+}  // namespace skia
new file mode 100644
--- /dev/null
+++ b/gfx/2d/convolver.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_CONVOLVER_H_
+#define SKIA_EXT_CONVOLVER_H_
+
+#include <cmath>
+#include <vector>
+
+#include "basictypes.h"
+#include "prtypes.h"
+#include "cpu.h"
+#include "skia/SkTypes.h"
+
+// avoid confusion with Mac OS X's math library (Carbon)
+#if defined(__APPLE__)
+#undef FloatToFixed
+#undef FixedToFloat
+#endif
+
+namespace skia {
+
+// Represents a filter in one dimension. Each output pixel has one entry in this
+// object for the filter values contributing to it. You build up the filter
+// list by calling AddFilter for each output pixel (in order).
+//
+// We do 2-dimensional convolution by first convolving each row by one
+// ConvolutionFilter1D, then convolving each column by another one.
+//
+// Entries are stored in fixed point, shifted left by kShiftBits.
+class ConvolutionFilter1D {
+ public:
+  typedef short Fixed;
+
+  // The number of bits that fixed point values are shifted by.
+  enum { kShiftBits = 14 };
+
+  ConvolutionFilter1D();
+  ~ConvolutionFilter1D();
+
+  // Convert between floating point and our fixed point representation.
+  static Fixed FloatToFixed(float f) {
+    return static_cast<Fixed>(f * (1 << kShiftBits));
+  }
+  static unsigned char FixedToChar(Fixed x) {
+    return static_cast<unsigned char>(x >> kShiftBits);
+  }
+  static float FixedToFloat(Fixed x) {
+    // The cast relies on Fixed being a short, implying that on
+    // the platforms we care about all (16) bits will fit into
+    // the mantissa of a (32-bit) float.
+    COMPILE_ASSERT(sizeof(Fixed) == 2, fixed_type_should_fit_in_float_mantissa);
+    float raw = static_cast<float>(x);
+    return ldexpf(raw, -kShiftBits);
+  }
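+
+  // Example (illustrative, not part of the imported Chromium code): with
+  // kShiftBits == 14, FloatToFixed(0.25f) == 4096 and FloatToFixed(1.0f)
+  // == 16384, so a normalized filter's fixed-point coefficients sum to 16384;
+  // FixedToFloat(4096) recovers 0.25f exactly.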
+
+  // Returns the maximum pixel span of a filter.
+  int max_filter() const { return max_filter_; }
+
+  // Returns the number of filters in this filter. This is the dimension of the
+  // output image.
+  int num_values() const { return static_cast<int>(filters_.size()); }
+
+  // Appends the given list of scaling values for generating a given output
+  // pixel. |filter_offset| is the distance from the edge of the image to where
+  // the scaling factors start. The scaling factors apply to the source pixels
+  // starting from this position, and going for the next |filter_length| pixels.
+  //
+  // You will probably want to make sure your input is normalized (that is,
+  // all entries in |filter_values| sum to one) to prevent affecting the
+  // overall brightness of the image.
+  //
+  // The filter_length must be > 0.
+  //
+  // This version will automatically convert your input to fixed point.
+  void AddFilter(int filter_offset,
+                 const float* filter_values,
+                 int filter_length);
+
+  // Same as the above version, but the input is already fixed point.
+  void AddFilter(int filter_offset,
+                 const Fixed* filter_values,
+                 int filter_length);
+
+  // Retrieves a filter for the given |value_offset|, a position in the output
+  // image in the direction we're convolving. The offset and length of the
+  // filter values are put into the corresponding out arguments (see AddFilter
+  // above for what these mean), and a pointer to the first scaling factor is
+  // returned. There will be |filter_length| values in this array.
+  inline const Fixed* FilterForValue(int value_offset,
+                                     int* filter_offset,
+                                     int* filter_length) const {
+    const FilterInstance& filter = filters_[value_offset];
+    *filter_offset = filter.offset;
+    *filter_length = filter.length;
+    if (filter.length == 0) {
+      return NULL;
+    }
+    return &filter_values_[filter.data_location];
+  }
+
+  inline void PaddingForSIMD(int padding_count) {
+    // Pads |padding_count| dummy coefficients after the coefficients of the
+    // last filter to prevent SIMD instructions, which load 8 or 16 bytes at a
+    // time, from accessing invalid memory areas. We are not trying to align
+    // the coefficients right now due to the opaqueness of the <vector>
+    // implementation. This has to be done after all |AddFilter| calls.
+    for (int i = 0; i < padding_count; ++i)
+      filter_values_.push_back(static_cast<Fixed>(0));
+  }
+
+ private:
+  struct FilterInstance {
+    // Offset within filter_values for this instance of the filter.
+    int data_location;
+
+    // Distance from the edge of the image to where the filter values start,
+    // in pixels (see AddFilter).
+    int offset;
+
+    // Number of values in this filter instance.
+    int length;
+  };
+
+  // Stores the information for each filter added to this class.
+  std::vector<FilterInstance> filters_;
+
+  // We store all the filter values in this flat list, indexed by
+  // |FilterInstance.data_location| to avoid the mallocs required for storing
+  // each one separately.
+  std::vector<Fixed> filter_values_;
+
+  // The maximum size of any filter we've added.
+  int max_filter_;
+};
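+
+// Example (hypothetical usage sketch; the weights and offsets are made up):
+//   ConvolutionFilter1D filter;
+//   // Output pixel 0 blends source pixels [0, 2] with the given weights.
+//   static const float kWeights[] = { 0.25f, 0.5f, 0.25f };
+//   filter.AddFilter(0, kWeights, 3);
+//   // ... one AddFilter call per output pixel, in order ...
+//   filter.PaddingForSIMD(8);  // only after all AddFilter calls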
+
+// Does a two-dimensional convolution on the given source image.
+//
+// It is assumed the source pixel offsets referenced in the input filters
+// reference only valid pixels, so the source image size is not required. Each
+// row of the source image starts |source_byte_row_stride| after the previous
+// one (this allows you to have rows with some padding at the end).
+//
+// The result will be put into the given output buffer. The destination image
+// size will be xfilter.num_values() * yfilter.num_values() pixels. It will be
+// in rows of exactly xfilter.num_values() * 4 bytes.
+//
+// |source_has_alpha| is a hint that allows us to avoid doing computations on
+// the alpha channel if the image is opaque. If you don't know, set this to
+// true and it will work properly, but setting this to false will be a few
+// percent faster if you know the image is opaque.
+//
+// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
+// (this is ARGB when loaded into 32-bit words on a little-endian machine).
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& xfilter,
+                    const ConvolutionFilter1D& yfilter,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2);
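+
+// Example (hypothetical call sketch; |src_pixels|, |dst_pixels| and the two
+// filters are assumed to have been prepared as described above):
+//   BGRAConvolve2D(src_pixels, src_width * 4, /* source_has_alpha = */ true,
+//                  x_filter, y_filter, x_filter.num_values() * 4, dst_pixels,
+//                  /* use_sse2 = */ false);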
+}  // namespace skia
+
+#endif  // SKIA_EXT_CONVOLVER_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/cpu.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_CPU_H_
+#define BASE_CPU_H_
+
+#include <string>
+
+namespace base {
+
+// Query information about the processor.
+class CPU {
+ public:
+  // Constructor
+  CPU();
+
+  // Accessors for CPU information.
+  const std::string& vendor_name() const { return cpu_vendor_; }
+  int stepping() const { return stepping_; }
+  int model() const { return model_; }
+  int family() const { return family_; }
+  int type() const { return type_; }
+  int extended_model() const { return ext_model_; }
+  int extended_family() const { return ext_family_; }
+
+ private:
+  // Query the processor for CPUID information.
+  void Initialize();
+
+  int type_;  // process type
+  int family_;  // family of the processor
+  int model_;  // model of processor
+  int stepping_;  // processor revision number
+  int ext_model_;
+  int ext_family_;
+  std::string cpu_vendor_;
+};
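+
+// Example (illustrative only):
+//   base::CPU cpu;
+//   if (cpu.vendor_name() == "GenuineIntel" && cpu.family() >= 6) {
+//     // ... choose a code path based on the CPUID data ...
+//   }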
+
+}  // namespace base
+
+#endif  // BASE_CPU_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.cpp
@@ -0,0 +1,536 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "basictypes.h"
+
+#define _USE_MATH_DEFINES
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "image_operations.h"
+
+#include "nsAlgorithm.h"
+#include "stack_container.h"
+#include "convolver.h"
+#include "skia/SkColorPriv.h"
+#include "skia/SkBitmap.h"
+#include "skia/SkRect.h"
+#include "skia/SkFontHost.h"
+
+namespace skia {
+
+namespace {
+
+// Returns the ceiling/floor as an integer.
+inline int CeilInt(float val) {
+  return static_cast<int>(ceil(val));
+}
+inline int FloorInt(float val) {
+  return static_cast<int>(floor(val));
+}
+
+// Filter function computation -------------------------------------------------
+
+// Evaluates the box filter, which goes from -0.5 to +0.5.
+float EvalBox(float x) {
+  return (x >= -0.5f && x < 0.5f) ? 1.0f : 0.0f;
+}
+
+// Evaluates the Lanczos filter of the given filter size window for the given
+// position.
+//
+// |filter_size| is the width of the filter (the "window"), outside of which
+// the value of the function is 0. Inside of the window, the value is the
+// normalized sinc function:
+//   lanczos(x) = sinc(x) * sinc(x / filter_size);
+// where
+//   sinc(x) = sin(pi*x) / (pi*x);
+float EvalLanczos(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the discontinuity at the origin.
+  float xpi = x * static_cast<float>(M_PI);
+  return (sin(xpi) / xpi) *  // sinc(x)
+          sin(xpi / filter_size) / (xpi / filter_size);  // sinc(x/filter_size)
+}
+
+// Evaluates the Hamming filter of the given filter size window for the given
+// position.
+//
+// The filter covers [-filter_size, +filter_size]. Outside of this window
+// the value of the function is 0. Inside of the window, the value is sinus
+// cardinal multiplied by a recentered Hamming function. The traditional
+// Hamming formula for a window of size N and n ranging in [0, N-1] is:
+//   hamming(n) = 0.54 - 0.46 * cos(2 * pi * n / (N-1)))
+// In our case we want the function centered for x == 0 and at its minimum
+// on both ends of the window (x == +/- filter_size), hence the adjusted
+// formula:
+//   hamming(x) = (0.54 -
+//                 0.46 * cos(2 * pi * (x - filter_size)/ (2 * filter_size)))
+//              = 0.54 - 0.46 * cos(pi * x / filter_size - pi)
+//              = 0.54 + 0.46 * cos(pi * x / filter_size)
+float EvalHamming(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -std::numeric_limits<float>::epsilon() &&
+      x < std::numeric_limits<float>::epsilon())
+    return 1.0f;  // Special case the sinc discontinuity at the origin.
+  const float xpi = x * static_cast<float>(M_PI);
+
+  return ((sin(xpi) / xpi) *  // sinc(x)
+          (0.54f + 0.46f * cos(xpi / filter_size)));  // hamming(x)
+}
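+
+// Example values for the filters above (illustrative): EvalBox(0.0f) == 1 and
+// EvalBox(0.75f) == 0; EvalLanczos(3, 0.0f) == 1 and EvalLanczos(3, 3.0f) == 0
+// (outside the window); EvalHamming(1, 0.0f) == 1 thanks to the same special
+// case at the origin.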
+
+// ResizeFilter ----------------------------------------------------------------
+
+// Encapsulates computation and storage of the filters required for one complete
+// resize operation.
+class ResizeFilter {
+ public:
+  ResizeFilter(ImageOperations::ResizeMethod method,
+               int src_full_width, int src_full_height,
+               int dest_width, int dest_height,
+               const SkIRect& dest_subset);
+
+  // Returns the filled filter values.
+  const ConvolutionFilter1D& x_filter() { return x_filter_; }
+  const ConvolutionFilter1D& y_filter() { return y_filter_; }
+
+ private:
+  // Returns the number of pixels that the filter spans, in filter space (the
+  // destination image).
+  float GetFilterSupport(float scale) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        // The box filter just scales with the image scaling.
+        return 0.5f;  // Only want one side of the filter = /2.
+      case ImageOperations::RESIZE_HAMMING1:
+        // The Hamming filter takes as much space in the source image in
+        // each direction as the size of the window = 1 for Hamming1.
+        return 1.0f;
+      case ImageOperations::RESIZE_LANCZOS2:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 2 for Lanczos2.
+        return 2.0f;
+      case ImageOperations::RESIZE_LANCZOS3:
+        // The Lanczos filter takes as much space in the source image in
+        // each direction as the size of the window = 3 for Lanczos3.
+        return 3.0f;
+      default:
+        return 1.0f;
+    }
+  }
+
+  // Computes one set of filters either horizontally or vertically. The caller
+  // will specify the "min" and "max" rather than the bottom/top and
+  // right/bottom so that the same code can be re-used in each dimension.
+  //
+  // |src_depend_lo| and |src_depend_size| gives the range for the source
+  // depend rectangle (horizontally or vertically at the caller's discretion
+  // -- see above for what this means).
+  //
+  // Likewise, the range of destination values to compute and the scale factor
+  // for the transform is also specified.
+  void ComputeFilters(int src_size,
+                      int dest_subset_lo, int dest_subset_size,
+                      float scale, float src_support,
+                      ConvolutionFilter1D* output);
+
+  // Computes the filter value given the coordinate in filter space.
+  inline float ComputeFilter(float pos) {
+    switch (method_) {
+      case ImageOperations::RESIZE_BOX:
+        return EvalBox(pos);
+      case ImageOperations::RESIZE_HAMMING1:
+        return EvalHamming(1, pos);
+      case ImageOperations::RESIZE_LANCZOS2:
+        return EvalLanczos(2, pos);
+      case ImageOperations::RESIZE_LANCZOS3:
+        return EvalLanczos(3, pos);
+      default:
+        return 0;
+    }
+  }
+
+  ImageOperations::ResizeMethod method_;
+
+  // Size of the filter support on one side only in the destination space.
+  // See GetFilterSupport.
+  float x_filter_support_;
+  float y_filter_support_;
+
+  // Subset of scaled destination bitmap to compute.
+  SkIRect out_bounds_;
+
+  ConvolutionFilter1D x_filter_;
+  ConvolutionFilter1D y_filter_;
+
+  DISALLOW_COPY_AND_ASSIGN(ResizeFilter);
+};
+
+ResizeFilter::ResizeFilter(ImageOperations::ResizeMethod method,
+                           int src_full_width, int src_full_height,
+                           int dest_width, int dest_height,
+                           const SkIRect& dest_subset)
+    : method_(method),
+      out_bounds_(dest_subset) {
+  // method_ will only ever refer to an "algorithm method".
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  float scale_x = static_cast<float>(dest_width) /
+                  static_cast<float>(src_full_width);
+  float scale_y = static_cast<float>(dest_height) /
+                  static_cast<float>(src_full_height);
+
+  x_filter_support_ = GetFilterSupport(scale_x);
+  y_filter_support_ = GetFilterSupport(scale_y);
+
+  // Support of the filter in source space.
+  float src_x_support = x_filter_support_ / scale_x;
+  float src_y_support = y_filter_support_ / scale_y;
+
+  ComputeFilters(src_full_width, dest_subset.fLeft, dest_subset.width(),
+                 scale_x, src_x_support, &x_filter_);
+  ComputeFilters(src_full_height, dest_subset.fTop, dest_subset.height(),
+                 scale_y, src_y_support, &y_filter_);
+}
+
+// TODO(egouriou): Take advantage of periods in the convolution.
+// Practical resizing filters are periodic outside of the border area.
+// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
+// source become p pixels in the destination) will have a period of p.
+// A nice consequence is a period of 1 when downscaling by an integral
+// factor. Downscaling from typical display resolutions is also bound
+// to produce interesting periods as those are chosen to have multiple
+// small factors.
+// Small periods reduce computational load and improve cache usage if
+// the coefficients can be shared. For periods of 1 we can consider
+// loading the factors only once outside the borders.
+void ResizeFilter::ComputeFilters(int src_size,
+                                  int dest_subset_lo, int dest_subset_size,
+                                  float scale, float src_support,
+                                  ConvolutionFilter1D* output) {
+  int dest_subset_hi = dest_subset_lo + dest_subset_size;  // [lo, hi)
+
+  // When we're doing a magnification, the scale will be larger than one. This
+  // means the destination pixels are much smaller than the source pixels, and
+  // that the range covered by the filter won't necessarily cover any source
+  // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+  // some computations.
+  float clamped_scale = NS_MIN(1.0f, scale);
+
+  // Speed up the divisions below by turning them into multiplies.
+  float inv_scale = 1.0f / scale;
+
+  StackVector<float, 64> filter_values;
+  StackVector<int16_t, 64> fixed_filter_values;
+
+  // Loop over all pixels in the output range. We will generate one set of
+  // filter values for each one. Those values will tell us how to blend the
+  // source pixels to compute the destination pixel.
+  for (int dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi;
+       dest_subset_i++) {
+    // Reset the arrays. We don't declare them inside so they can re-use the
+    // same malloc-ed buffer.
+    filter_values->clear();
+    fixed_filter_values->clear();
+
+    // This is the pixel in the source directly under the pixel in the dest.
+    // Note that we base computations on the "center" of the pixels. To see
+    // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
+    // downscale should "cover" the pixels around the pixel with *its center*
+    // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
+    // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
+    // TODO(evannier): this code is therefore incorrect and should read:
+    // float src_pixel = (static_cast<float>(dest_subset_i) + 0.5f) * inv_scale;
+    // I leave it incorrect, because changing it would require modifying
+    // the results for the webkit test, which I will do in a subsequent checkin.
+    float src_pixel = dest_subset_i * inv_scale;
+
+    // Compute the (inclusive) range of source pixels the filter covers.
+    int src_begin = NS_MAX(0, FloorInt(src_pixel - src_support));
+    int src_end = NS_MIN(src_size - 1, CeilInt(src_pixel + src_support));
+
+    // Compute the unnormalized filter value at each location of the source
+    // it covers.
+    float filter_sum = 0.0f;  // Sum of the filter values for normalizing.
+    for (int cur_filter_pixel = src_begin; cur_filter_pixel <= src_end;
+         cur_filter_pixel++) {
+      // Distance from the center of the filter, this is the filter coordinate
+      // in source space. We also need to consider the center of the pixel
+      // when comparing distance against 'src_pixel'. In the 5x downscale
+      // example used above the distance from the center of the filter to
+      // the pixel with coordinates (2, 2) should be 0, because its center
+      // is at (2.5, 2.5).
+      // TODO(evannier): as above (in regards to the 0.5 pixel error),
+      // this code is incorrect, but is left as-is for the same reasons.
+      // float src_filter_dist =
+      //     ((static_cast<float>(cur_filter_pixel) + 0.5f) - src_pixel);
+      float src_filter_dist = cur_filter_pixel - src_pixel;
+
+      // Since the filter really exists in dest space, map it there.
+      float dest_filter_dist = src_filter_dist * clamped_scale;
+
+      // Compute the filter value at that location.
+      float filter_value = ComputeFilter(dest_filter_dist);
+      filter_values->push_back(filter_value);
+
+      filter_sum += filter_value;
+    }
+
+    // The filter must be normalized so that we don't affect the brightness of
+    // the image. Convert to normalized fixed point.
+    int16_t fixed_sum = 0;
+    for (size_t i = 0; i < filter_values->size(); i++) {
+      int16_t cur_fixed = output->FloatToFixed(filter_values[i] / filter_sum);
+      fixed_sum += cur_fixed;
+      fixed_filter_values->push_back(cur_fixed);
+    }
+
+    // The conversion to fixed point will leave some rounding errors, which
+    // we add back in to avoid affecting the brightness of the image. We
+    // arbitrarily add this to the center of the filter array (this won't always
+    // be the center of the filter function since it could get clipped on the
+    // edges, but it doesn't matter enough to worry about that case).
+    int16_t leftovers = output->FloatToFixed(1.0f) - fixed_sum;
+    fixed_filter_values[fixed_filter_values->size() / 2] += leftovers;
+
+    // Now it's ready to go.
+    output->AddFilter(src_begin, &fixed_filter_values[0],
+                      static_cast<int>(fixed_filter_values->size()));
+  }
+
+  output->PaddingForSIMD(8);
+}
+
+ImageOperations::ResizeMethod ResizeMethodToAlgorithmMethod(
+    ImageOperations::ResizeMethod method) {
+  // Convert any "Quality Method" into an "Algorithm Method"
+  if (method >= ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD &&
+      method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD) {
+    return method;
+  }
+  // There is no GPU-accelerated path here, so we just pick the appropriate
+  // software method for each resize quality.
+  switch (method) {
+    // Users of RESIZE_GOOD are willing to trade a lot of quality to
+    // get speed, allowing the use of linear resampling to get hardware
+    // acceleration (SRB). Hence any of our "good" software filters
+    // will be acceptable, and we use the fastest one, Hamming-1.
+    case ImageOperations::RESIZE_GOOD:
+      // Users of RESIZE_BETTER are willing to trade some quality in order
+      // to improve performance, but are guaranteed not to devolve to a linear
+      // resampling. In visual tests we see that Hamming-1 is not as good as
+      // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is
+      // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed
+      // an acceptable trade-off between quality and speed.
+    case ImageOperations::RESIZE_BETTER:
+      return ImageOperations::RESIZE_HAMMING1;
+    default:
+      return ImageOperations::RESIZE_LANCZOS3;
+  }
+}
+
+}  // namespace
+
+// Resize ----------------------------------------------------------------------
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset,
+                                 void* dest_pixels /* = nullptr */) {
+  if (method == ImageOperations::RESIZE_SUBPIXEL)
+    return ResizeSubpixel(source, dest_width, dest_height, dest_subset);
+  else
+    return ResizeBasic(source, method, dest_width, dest_height, dest_subset,
+                       dest_pixels);
+}
+
+// static
+SkBitmap ImageOperations::ResizeSubpixel(const SkBitmap& source,
+                                         int dest_width, int dest_height,
+                                         const SkIRect& dest_subset) {
+  // Currently only works on Linux/BSD because these are the only platforms
+  // where SkFontHost::GetSubpixelOrder is defined.
+#if defined(XP_UNIX)
+  // Understand the display.
+  const SkFontHost::LCDOrder order = SkFontHost::GetSubpixelOrder();
+  const SkFontHost::LCDOrientation orientation =
+      SkFontHost::GetSubpixelOrientation();
+
+  // Decide on which dimension, if any, to deploy subpixel rendering.
+  int w = 1;
+  int h = 1;
+  switch (orientation) {
+    case SkFontHost::kHorizontal_LCDOrientation:
+      w = dest_width < source.width() ? 3 : 1;
+      break;
+    case SkFontHost::kVertical_LCDOrientation:
+      h = dest_height < source.height() ? 3 : 1;
+      break;
+  }
+
+  // Resize the image.
+  const int width = dest_width * w;
+  const int height = dest_height * h;
+  SkIRect subset = { dest_subset.fLeft, dest_subset.fTop,
+                     dest_subset.fLeft + dest_subset.width() * w,
+                     dest_subset.fTop + dest_subset.height() * h };
+  SkBitmap img = ResizeBasic(source, ImageOperations::RESIZE_LANCZOS3, width,
+                             height, subset);
+  const int row_words = img.rowBytes() / 4;
+  if (w == 1 && h == 1)
+    return img;
+
+  // Render into subpixels.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config, dest_subset.width(),
+                   dest_subset.height());
+  result.allocPixels();
+  if (!result.readyToDraw())
+    return img;
+
+  SkAutoLockPixels locker(img);
+  if (!img.readyToDraw())
+    return img;
+
+  uint32_t* src_row = img.getAddr32(0, 0);
+  uint32_t* dst_row = result.getAddr32(0, 0);
+  for (int y = 0; y < dest_subset.height(); y++) {
+    uint32_t* src = src_row;
+    uint32_t* dst = dst_row;
+    for (int x = 0; x < dest_subset.width(); x++, src += w, dst++) {
+      uint8_t r = 0, g = 0, b = 0, a = 0;
+      switch (order) {
+        case SkFontHost::kRGB_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              r = SkGetPackedR32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              b = SkGetPackedB32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              r = SkGetPackedR32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              b = SkGetPackedB32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kBGR_LCDOrder:
+          switch (orientation) {
+            case SkFontHost::kHorizontal_LCDOrientation:
+              b = SkGetPackedB32(src[0]);
+              g = SkGetPackedG32(src[1]);
+              r = SkGetPackedR32(src[2]);
+              a = SkGetPackedA32(src[1]);
+              break;
+            case SkFontHost::kVertical_LCDOrientation:
+              b = SkGetPackedB32(src[0 * row_words]);
+              g = SkGetPackedG32(src[1 * row_words]);
+              r = SkGetPackedR32(src[2 * row_words]);
+              a = SkGetPackedA32(src[1 * row_words]);
+              break;
+          }
+          break;
+        case SkFontHost::kNONE_LCDOrder:
+          break;
+      }
+      // Premultiplied alpha is very fragile.
+      a = a > r ? a : r;
+      a = a > g ? a : g;
+      a = a > b ? a : b;
+      *dst = SkPackARGB32(a, r, g, b);
+    }
+    src_row += h * row_words;
+    dst_row += result.rowBytes() / 4;
+  }
+  result.setIsOpaque(img.isOpaque());
+  return result;
+#else
+  return SkBitmap();
+#endif  // XP_UNIX
+}
+
+// static
+SkBitmap ImageOperations::ResizeBasic(const SkBitmap& source,
+                                      ResizeMethod method,
+                                      int dest_width, int dest_height,
+                                      const SkIRect& dest_subset,
+                                      void* dest_pixels /* = nullptr */) {
+  // Ensure that the ResizeMethod enumeration is sound.
+  SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
+            (method <= RESIZE_LAST_QUALITY_METHOD)) ||
+           ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+            (method <= RESIZE_LAST_ALGORITHM_METHOD)));
+
+  // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
+  // return empty.
+  if (source.width() < 1 || source.height() < 1 ||
+      dest_width < 1 || dest_height < 1)
+    return SkBitmap();
+
+  method = ResizeMethodToAlgorithmMethod(method);
+  // Check that we deal with an "algorithm method" from this point onward.
+  SkASSERT((ImageOperations::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+           (method <= ImageOperations::RESIZE_LAST_ALGORITHM_METHOD));
+
+  SkAutoLockPixels locker(source);
+  if (!source.readyToDraw())
+    return SkBitmap();
+
+  ResizeFilter filter(method, source.width(), source.height(),
+                      dest_width, dest_height, dest_subset);
+
+  // Get a pointer to the source pixels. The convolver indexes into them
+  // using the source row stride, so no copy is needed.
+  const uint8_t* source_subset =
+      reinterpret_cast<const uint8_t*>(source.getPixels());
+
+  // Convolve into the result.
+  SkBitmap result;
+  result.setConfig(SkBitmap::kARGB_8888_Config,
+                   dest_subset.width(), dest_subset.height());
+
+  if (dest_pixels) {
+    result.setPixels(dest_pixels);
+  } else {
+    result.allocPixels();
+  }
+
+  if (!result.readyToDraw())
+    return SkBitmap();
+
+  BGRAConvolve2D(source_subset, static_cast<int>(source.rowBytes()),
+                 !source.isOpaque(), filter.x_filter(), filter.y_filter(),
+                 static_cast<int>(result.rowBytes()),
+                 static_cast<unsigned char*>(result.getPixels()),
+                 /* use_sse2 = */ false);
+
+  // Preserve the "opaque" flag for use as an optimization later.
+  result.setIsOpaque(source.isOpaque());
+
+  return result;
+}
+
+// static
+SkBitmap ImageOperations::Resize(const SkBitmap& source,
+                                 ResizeMethod method,
+                                 int dest_width, int dest_height,
+                                 void* dest_pixels /* = nullptr */) {
+  SkIRect dest_subset = { 0, 0, dest_width, dest_height };
+  return Resize(source, method, dest_width, dest_height, dest_subset,
+                dest_pixels);
+}
+
+}  // namespace skia
new file mode 100644
--- /dev/null
+++ b/gfx/2d/image_operations.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SKIA_EXT_IMAGE_OPERATIONS_H_
+#define SKIA_EXT_IMAGE_OPERATIONS_H_
+
+#include "skia/SkTypes.h"
+#include "Types.h"
+
+class SkBitmap;
+struct SkIRect;
+
+namespace skia {
+
+class ImageOperations {
+ public:
+  enum ResizeMethod {
+    //
+    // Quality Methods
+    //
+    // Those enumeration values express a desired quality/speed tradeoff.
+    // They are translated into an algorithm-specific method that depends
+    // on the capabilities (CPU, GPU) of the underlying platform.
+    // It is possible for all three methods to be mapped to the same
+    // algorithm on a given platform.
+
+    // Good quality resizing. Fastest resizing with acceptable visual quality.
+    // This is typically intended for use during interactive layouts
+    // where slower platforms may want to trade image quality for a large
+    // increase in resizing performance.
+    //
+    // For example the resizing implementation may devolve to linear
+    // filtering if this enables GPU acceleration to be used.
+    //
+    // Note that the underlying resizing method may be determined
+    // on the fly based on the parameters for a given resize call.
+    // For example an implementation using a GPU-based linear filter
+    // in the common case may still use a higher-quality software-based
+    // filter in cases where using the GPU would actually be slower - due
+    // to too much latency - or impossible - due to image format or size
+    // constraints.
+    RESIZE_GOOD,
+
+    // Medium quality resizing. Close to high quality resizing (better
+    // than linear interpolation) with potentially some quality being
+    // traded-off for additional speed compared to RESIZE_BEST.
+    //
+    // This is intended, for example, for generation of large thumbnails
+    // (hundreds of pixels in each dimension) from large sources, where
+    // a linear filter would produce too many artifacts but where
+    // RESIZE_BEST might be too costly time-wise.
+    RESIZE_BETTER,
+
+    // High quality resizing. The algorithm is picked to favor image quality.
+    RESIZE_BEST,
+
+    //
+    // Algorithm-specific enumerations
+    //
+
+    // Box filter. This is a weighted average of all of the pixels touching
+    // the destination pixel. For enlargement, this is nearest neighbor.
+    //
+    // You probably don't want this, it is here for testing since it is easy to
+    // compute. Use RESIZE_LANCZOS3 instead.
+    RESIZE_BOX,
+
+    // 1-cycle Hamming filter. This is tall in the middle and falls off towards
+    // the window edges but without going to 0. This is about 40% faster than
+    // a 2-cycle Lanczos.
+    RESIZE_HAMMING1,
+
+    // 2-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then returns to zero. Does not provide as good a frequency
+    // response as a 3-cycle Lanczos but is roughly 30% faster.
+    RESIZE_LANCZOS2,
+
+    // 3-cycle Lanczos filter. This is tall in the middle, goes negative on
+    // each side, then oscillates 2 more times. It gives nice sharp edges.
+    RESIZE_LANCZOS3,
+
+    // Lanczos filter + subpixel interpolation. If subpixel rendering is not
+    // appropriate we automatically fall back to Lanczos.
+    RESIZE_SUBPIXEL,
+
+    // enum aliases for first and last methods by algorithm or by quality.
+    RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
+    RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
+    RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
+    RESIZE_LAST_ALGORITHM_METHOD = RESIZE_SUBPIXEL,
+  };
+
+  // Resizes the given source bitmap using the specified resize method, so that
+  // the entire image is (dest_width x dest_height) big. The dest_subset is
+  // the rectangle in this destination image that should actually be returned.
+  //
+  // The output image will be (dest_subset.width(), dest_subset.height()). This
+  // will save work if you do not need the entire bitmap.
+  //
+  // The destination subset must be smaller than the destination image.
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         const SkIRect& dest_subset,
+                         void* dest_pixels = nullptr);
+
+  // Alternate version for resizing and returning the entire bitmap rather than
+  // a subset.
+  static SkBitmap Resize(const SkBitmap& source,
+                         ResizeMethod method,
+                         int dest_width, int dest_height,
+                         void* dest_pixels = nullptr);
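+
+  // Example (hypothetical usage sketch; |source| is assumed to be a valid
+  // kARGB_8888_Config bitmap):
+  //   SkIRect subset = { 0, 0, 64, 64 };
+  //   SkBitmap thumb = skia::ImageOperations::Resize(
+  //       source, skia::ImageOperations::RESIZE_LANCZOS3, 128, 128, subset);
+  //   // |thumb| holds the top-left 64x64 of the 128x128 resized image.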
+
+ private:
+  ImageOperations();  // Class for scoping only.
+
+  // Supports all methods except RESIZE_SUBPIXEL.
+  static SkBitmap ResizeBasic(const SkBitmap& source,
+                              ResizeMethod method,
+                              int dest_width, int dest_height,
+                              const SkIRect& dest_subset,
+                              void* dest_pixels = nullptr);
+
+  // Subpixel renderer.
+  static SkBitmap ResizeSubpixel(const SkBitmap& source,
+                                 int dest_width, int dest_height,
+                                 const SkIRect& dest_subset);
+};
+
+}  // namespace skia
+
+#endif  // SKIA_EXT_IMAGE_OPERATIONS_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/port.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_PORT_H_
+#define BASE_PORT_H_
+
+#include <stdarg.h>
+#include "build/build_config.h"
+
+#ifdef COMPILER_MSVC
+#define GG_LONGLONG(x) x##I64
+#define GG_ULONGLONG(x) x##UI64
+#else
+#define GG_LONGLONG(x) x##LL
+#define GG_ULONGLONG(x) x##ULL
+#endif
+
+// Per C99 7.18.4, define __STDC_CONSTANT_MACROS before including <stdint.h>
+// to get the INTn_C and UINTn_C macros for integer constants.  It's difficult
+// to guarantee any specific ordering of header includes, so it's difficult to
+// guarantee that the INTn_C macros can be defined by including <stdint.h> at
+// any specific point.  Provide GG_INTn_C macros instead.
+
+#define GG_INT8_C(x)    (x)
+#define GG_INT16_C(x)   (x)
+#define GG_INT32_C(x)   (x)
+#define GG_INT64_C(x)   GG_LONGLONG(x)
+
+#define GG_UINT8_C(x)   (x ## U)
+#define GG_UINT16_C(x)  (x ## U)
+#define GG_UINT32_C(x)  (x ## U)
+#define GG_UINT64_C(x)  GG_ULONGLONG(x)
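+
+// Example (illustrative): GG_INT64_C(0x7fffffffffffffff) expands to
+// 0x7fffffffffffffffLL with GCC/clang and to 0x7fffffffffffffffI64 with MSVC,
+// giving a portable way to write 64-bit integer constants.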
+
+namespace base {
+
+// It's possible for functions that use a va_list, such as StringPrintf, to
+// invalidate the data in it upon use.  The fix is to make a copy of the
+// structure before using it and use that copy instead.  va_copy is provided
+// for this purpose.  MSVC does not provide va_copy, so define an
+// implementation here.  It is not guaranteed that assignment is a copy, so the
+// StringUtil.VariableArgsFunc unit test tests this capability.
+
+// The C standard says that va_copy is a "macro", not a function.  Trying to 
+// use va_list as ref args to a function, as above, breaks some machines.
+#  if defined(COMPILER_GCC)
+#    define base_va_copy(_a, _b) ::va_copy(_a, _b)
+#  elif defined(COMPILER_MSVC)
+#    define base_va_copy(_a, _b) (_a = _b)
+#  else
+#    error No va_copy for your compiler
+#  endif
+
+}  // namespace base
+
+// Define an OS-neutral wrapper for shared library entry points
+#if defined(OS_WIN)
+#define API_CALL __stdcall
+#elif defined(OS_LINUX) || defined(OS_MACOSX)
+#define API_CALL
+#endif
+
+#endif  // BASE_PORT_H_
new file mode 100644
--- /dev/null
+++ b/gfx/2d/stack_container.h
@@ -0,0 +1,253 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STACK_CONTAINER_H_
+#define BASE_STACK_CONTAINER_H_
+
+#include <string>
+#include <vector>
+
+#include "basictypes.h"
+
+// This allocator can be used with STL containers to provide a stack buffer
+// from which to allocate memory and overflows onto the heap. This stack buffer
+// would be allocated on the stack and allows us to avoid heap operations in
+// some situations.
+//
+// STL likes to make copies of allocators, so the allocator itself can't hold
+// the data. Instead, we make the creator responsible for creating a
+// StackAllocator::Source which contains the data. Copying the allocator
+// merely copies the pointer to this shared source, so all allocators created
+// based on our allocator will share the same stack buffer.
+//
+// This stack buffer implementation is very simple. The first allocation that
+// fits in the stack buffer will use the stack buffer. Any subsequent
+// allocations will not use the stack buffer, even if there is unused room.
+// This makes it appropriate for array-like containers, but the caller should
+// be sure to reserve() in the container up to the stack buffer size. Otherwise
+// the container will allocate a small array which will "use up" the stack
+// buffer.
+template<typename T, size_t stack_capacity>
+class StackAllocator : public std::allocator<T> {
+ public:
+  typedef typename std::allocator<T>::pointer pointer;
+  typedef typename std::allocator<T>::size_type size_type;
+
+  // Backing store for the allocator. The container owner is responsible for
+  // maintaining this for as long as any containers using this allocator are
+  // live.
+  struct Source {
+    Source() : used_stack_buffer_(false) {
+    }
+
+    // Casts the buffer to its right type.
+    T* stack_buffer() { return reinterpret_cast<T*>(stack_buffer_); }
+    const T* stack_buffer() const {
+      return reinterpret_cast<const T*>(stack_buffer_);
+    }
+
+    //
+    // IMPORTANT: Take care to ensure that stack_buffer_ is aligned
+    // since it is used to mimic an array of T.
+    // Be careful while declaring any unaligned types (like bool)
+    // before stack_buffer_.
+    //
+
+    // The buffer itself. It is not of type T because we don't want the
+    // constructors and destructors to be automatically called. Define a POD
+    // buffer of the right size instead.
+    char stack_buffer_[sizeof(T[stack_capacity])];
+
+    // Set when the stack buffer is used for an allocation. We do not track
+    // how much of the buffer is used, only that somebody is using it.
+    bool used_stack_buffer_;
+  };
+
+  // Used by containers when they want to refer to an allocator of type U.
+  template<typename U>
+  struct rebind {
+    typedef StackAllocator<U, stack_capacity> other;
+  };
+
+  // For the straight up copy c-tor, we can share storage.
+  StackAllocator(const StackAllocator<T, stack_capacity>& rhs)
+      : source_(rhs.source_) {
+  }
+
+  // ISO C++ requires the following constructor to be defined,
+  // and std::vector in VC++2008SP1 Release fails with an error
+  // in the class _Container_base_aux_alloc_real (from <xutility>)
+  // if the constructor does not exist.
+  // For this constructor, we cannot share storage; there's
+  // no guarantee that the Source buffer of Ts is large enough
+  // for Us.
+  // TODO: If we were fancy pants, perhaps we could share storage
+  // iff sizeof(T) == sizeof(U).
+  template<typename U, size_t other_capacity>
+  StackAllocator(const StackAllocator<U, other_capacity>& other)
+      : source_(NULL) {
+  }
+
+  explicit StackAllocator(Source* source) : source_(source) {
+  }
+
+  // Actually do the allocation. Use the stack buffer if nobody has used it yet
+  // and the size requested fits. Otherwise, fall through to the standard
+  // allocator.
+  pointer allocate(size_type n, void* hint = 0) {
+    if (source_ != NULL && !source_->used_stack_buffer_
+        && n <= stack_capacity) {
+      source_->used_stack_buffer_ = true;
+      return source_->stack_buffer();
+    } else {
+      return std::allocator<T>::allocate(n, hint);
+    }
+  }
+
+  // Free: when trying to free the stack buffer, just mark it as free. For
+  // non-stack-buffer pointers, just fall through to the standard allocator.
+  void deallocate(pointer p, size_type n) {
+    if (source_ != NULL && p == source_->stack_buffer())
+      source_->used_stack_buffer_ = false;
+    else
+      std::allocator<T>::deallocate(p, n);
+  }
+
+ private:
+  Source* source_;
+};
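+
+// Example (hypothetical usage sketch; you will normally use the StackContainer
+// wrappers below rather than the allocator directly):
+//   StackAllocator<int, 16>::Source source;
+//   StackAllocator<int, 16> allocator(&source);
+//   std::vector<int, StackAllocator<int, 16> > v(allocator);
+//   v.reserve(16);  // the first allocation of <= 16 ints uses the stack buffer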
+
+// A wrapper around STL containers that maintains a stack-sized buffer that the
+// initial capacity of the vector is based on. Growing the container beyond the
+// stack capacity will transparently overflow onto the heap. The container must
+// support reserve().
+//
+// WATCH OUT: the ContainerType MUST use the proper StackAllocator for this
+// type. This object is really intended to be used only internally. You'll want
+// to use the wrappers below for different types.
+template<typename TContainerType, int stack_capacity>
+class StackContainer {
+ public:
+  typedef TContainerType ContainerType;
+  typedef typename ContainerType::value_type ContainedType;
+  typedef StackAllocator<ContainedType, stack_capacity> Allocator;
+
+  // Allocator must be constructed before the container!
+  StackContainer() : allocator_(&stack_data_), container_(allocator_) {
+    // Make the container use the stack allocation by reserving our buffer size
+    // before doing anything else.
+    container_.reserve(stack_capacity);
+  }
+
+  // Getters for the actual container.
+  //
+  // Danger: any copies of this made using the copy constructor must have
+  // shorter lifetimes than the source. The copy will share the same allocator
+  // and therefore the same stack buffer as the original. Use std::copy to
+  // copy into a "real" container for longer-lived objects.
+  ContainerType& container() { return container_; }
+  const ContainerType& container() const { return container_; }
+
+  // Support operator-> to get to the container. This allows nicer syntax like:
+  //   StackContainer<...> foo;
+  //   std::sort(foo->begin(), foo->end());
+  ContainerType* operator->() { return &container_; }
+  const ContainerType* operator->() const { return &container_; }
+
+#ifdef UNIT_TEST
+  // Retrieves the stack source so that unit tests can verify that the
+  // buffer is being used properly.
+  const typename Allocator::Source& stack_data() const {
+    return stack_data_;
+  }
+#endif
+
+ protected:
+  typename Allocator::Source stack_data_;
+  Allocator allocator_;
+  ContainerType container_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(StackContainer);
+};
+
+// StackString
+template<size_t stack_capacity>
+class StackString : public StackContainer<
+    std::basic_string<char,
+                      std::char_traits<char>,
+                      StackAllocator<char, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackString() : StackContainer<
+      std::basic_string<char,
+                        std::char_traits<char>,
+                        StackAllocator<char, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(StackString);
+};
+
+// StackWString
+template<size_t stack_capacity>
+class StackWString : public StackContainer<
+    std::basic_string<wchar_t,
+                      std::char_traits<wchar_t>,
+                      StackAllocator<wchar_t, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackWString() : StackContainer<
+      std::basic_string<wchar_t,
+                        std::char_traits<wchar_t>,
+                        StackAllocator<wchar_t, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(StackWString);
+};
+
+// StackVector
+//
+// Example:
+//   StackVector<int, 16> foo;
+//   foo->push_back(22);  // we have overloaded operator->
+//   foo[0] = 10;         // as well as operator[]
+template<typename T, size_t stack_capacity>
+class StackVector : public StackContainer<
+    std::vector<T, StackAllocator<T, stack_capacity> >,
+    stack_capacity> {
+ public:
+  StackVector() : StackContainer<
+      std::vector<T, StackAllocator<T, stack_capacity> >,
+      stack_capacity>() {
+  }
+
+  // We need to put this in STL containers sometimes, which requires a copy
+  // constructor. We can't call the regular copy constructor because that will
+  // take the stack buffer from the original. Here, we create an empty object
+  // and make a stack buffer of its own.
+  StackVector(const StackVector<T, stack_capacity>& other)
+      : StackContainer<
+            std::vector<T, StackAllocator<T, stack_capacity> >,
+            stack_capacity>() {
+    this->container().assign(other->begin(), other->end());
+  }
+
+  StackVector<T, stack_capacity>& operator=(
+      const StackVector<T, stack_capacity>& other) {
+    this->container().assign(other->begin(), other->end());
+    return *this;
+  }
+
+  // Vectors are commonly indexed, which isn't very convenient even with
+  // operator-> (using "->at()" does exception stuff we don't want).
+  T& operator[](size_t i) { return this->container().operator[](i); }
+  const T& operator[](size_t i) const {
+    return this->container().operator[](i);
+  }
+};
+
+#endif  // BASE_STACK_CONTAINER_H_