Bug 577843 - Scale videos at YCbCr conversion time - r=roc a=blocking2.0
author Chris Double <chris.double@double.co.nz>
Tue, 26 Oct 2010 16:11:13 +1300
changeset 57389 8ecd9dc6684e04dd6a3d37b0bc8f40c5847e4e7a
parent 57388 59bbc730aee4b5bfb49fabac9633efadeb72c101
child 57390 64901a1fcf9339c7497c0f203d9df1f95df3c738
push id unknown
push user unknown
push date unknown
reviewers roc, blocking2
bugs 577843
milestone 2.0b8pre
Bug 577843 - Scale videos at YCbCr conversion time - r=roc a=blocking2.0
content/media/nsMediaDecoder.cpp
gfx/layers/ImageLayers.h
gfx/layers/basic/BasicImages.cpp
gfx/ycbcr/README
gfx/ycbcr/add_scale.patch
gfx/ycbcr/update.sh
gfx/ycbcr/yuv_convert.cpp
gfx/ycbcr/yuv_convert.h
gfx/ycbcr/yuv_row.h
gfx/ycbcr/yuv_row_c.cpp
gfx/ycbcr/yuv_row_linux.cpp
gfx/ycbcr/yuv_row_mac.cpp
gfx/ycbcr/yuv_row_win.cpp
layout/generic/nsVideoFrame.cpp
--- a/content/media/nsMediaDecoder.cpp
+++ b/content/media/nsMediaDecoder.cpp
@@ -45,28 +45,23 @@
 #include "nsIDocument.h"
 #include "nsThreadUtils.h"
 #include "nsIDOMHTMLMediaElement.h"
 #include "nsNetUtil.h"
 #include "nsHTMLMediaElement.h"
 #include "nsAutoLock.h"
 #include "nsIRenderingContext.h"
 #include "gfxContext.h"
-#include "gfxImageSurface.h"
 #include "nsPresContext.h"
 #include "nsDOMError.h"
 #include "nsDisplayList.h"
 #ifdef MOZ_SVG
 #include "nsSVGEffects.h"
 #endif
 
-#if defined(XP_MACOSX)
-#include "gfxQuartzImageSurface.h"
-#endif
-
 // Number of milliseconds between progress events as defined by spec
 #define PROGRESS_MS 350
 
 // Number of milliseconds of no data before a stall event is fired as defined by spec
 #define STALL_MS 3000
 
 // Number of milliseconds between timeupdate events as defined by spec
 #define TIMEUPDATE_MS 250
--- a/gfx/layers/ImageLayers.h
+++ b/gfx/layers/ImageLayers.h
@@ -109,16 +109,17 @@ protected:
  * (because layers can only be used on the main thread) and we want to
  * be able to set the current Image from any thread, to facilitate
  * video playback without involving the main thread, for example.
  */
 class THEBES_API ImageContainer {
   THEBES_INLINE_DECL_THREADSAFE_REFCOUNTING(ImageContainer)
 
 public:
+  ImageContainer() {}
   virtual ~ImageContainer() {}
 
   /**
    * Create an Image in one of the given formats.
    * Picks the "best" format from the list and creates an Image of that
    * format.
    * Returns null if this backend does not support any of the formats.
    */
@@ -174,16 +175,23 @@ public:
 
   /**
    * Set a new layer manager for this image container.  It must be
    * either of the same type as the container's current layer manager,
    * or null.  TRUE is returned on success.
    */
   virtual PRBool SetLayerManager(LayerManager *aManager) = 0;
 
+  /**
+   * Sets a size that the image is expected to be rendered at.
+   * This is a hint for image backends to optimize scaling.
+   * Default implementation in this class is to ignore the hint.
+   */
+  virtual void SetScaleHint(const gfxIntSize& /* aScaleHint */) { }
+
 protected:
   LayerManager* mManager;
 
   ImageContainer(LayerManager* aManager) : mManager(aManager) {}
 };
 
 /**
  * A Layer which renders an Image.
--- a/gfx/layers/basic/BasicImages.cpp
+++ b/gfx/layers/basic/BasicImages.cpp
@@ -99,39 +99,50 @@ protected:
 
 /**
  * We handle YCbCr by converting to RGB when the image is initialized
  * (which should be done off the main thread). The RGB results are stored
  * in a memory buffer and converted to a cairo surface lazily.
  */
 class BasicPlanarYCbCrImage : public PlanarYCbCrImage, public BasicImageImplData {
 public:
-  BasicPlanarYCbCrImage() :
-    PlanarYCbCrImage(static_cast<BasicImageImplData*>(this))
+  /**
+   * aScaleHint is the size that the image is expected to be rendered at; it
+   * lets SetData scale during YCbCr conversion. Non-positive means no hint.
+   */
+  BasicPlanarYCbCrImage(const gfxIntSize& aScaleHint) :
+    PlanarYCbCrImage(static_cast<BasicImageImplData*>(this)),
+    mScaleHint(aScaleHint)
     {}
 
   virtual void SetData(const Data& aData);
 
   virtual already_AddRefed<gfxASurface> GetAsSurface();
 
 protected:
   nsAutoArrayPtr<PRUint8>              mBuffer;
   nsCountedRef<nsMainThreadSurfaceRef> mSurface;
+  gfxIntSize                           mScaleHint;
 };
 
 void
 BasicPlanarYCbCrImage::SetData(const Data& aData)
 {
   // Do some sanity checks to prevent integer overflow
   if (aData.mYSize.width > 16384 || aData.mYSize.height > 16384) {
     NS_ERROR("Illegal width or height");
     return;
   }
-  size_t size = aData.mPicSize.width*aData.mPicSize.height*4;
-  mBuffer = new PRUint8[size];
+  // 'prescale' is true if the scaling is to be done as part of the
+  // YCbCr to RGB conversion rather than on the RGB data when rendered.
+  PRBool prescale = mScaleHint.width > 0 && mScaleHint.height > 0;
+  gfxIntSize size(prescale ? mScaleHint.width : aData.mPicSize.width,
+                  prescale ? mScaleHint.height : aData.mPicSize.height);
+
+  mBuffer = new PRUint8[size.width * size.height * 4];
   if (!mBuffer) {
     // out of memory
     return;
   }
 
   gfx::YUVType type = gfx::YV12;
   if (aData.mYSize.width == aData.mCbCrSize.width &&
       aData.mYSize.height == aData.mCbCrSize.height) {
@@ -144,30 +155,47 @@ BasicPlanarYCbCrImage::SetData(const Dat
   else if (aData.mYSize.width / 2 == aData.mCbCrSize.width &&
            aData.mYSize.height / 2 == aData.mCbCrSize.height ) {
     type = gfx::YV12;
   }
   else {
     NS_ERROR("YCbCr format not supported");
   }
  
-  // Convert from YCbCr to RGB now
-  gfx::ConvertYCbCrToRGB32(aData.mYChannel,
+  // Convert from YCbCr to RGB now, scaling the image if needed.
+  if (size != aData.mPicSize) {
+    gfx::ScaleYCbCrToRGB32(aData.mYChannel,
                            aData.mCbChannel,
                            aData.mCrChannel,
                            mBuffer,
-                           aData.mPicX,
-                           aData.mPicY,
                            aData.mPicSize.width,
                            aData.mPicSize.height,
+                           size.width,
+                           size.height,
                            aData.mYStride,
                            aData.mCbCrStride,
-                           aData.mPicSize.width*4,
-                           type);                                                          
-  mSize = aData.mPicSize;
+                           size.width*4,
+                           type,
+                           gfx::ROTATE_0);
+  }
+  else {
+    gfx::ConvertYCbCrToRGB32(aData.mYChannel,
+                             aData.mCbChannel,
+                             aData.mCrChannel,
+                             mBuffer,
+                             aData.mPicX,
+                             aData.mPicY,
+                             aData.mPicSize.width,
+                             aData.mPicSize.height,
+                             aData.mYStride,
+                             aData.mCbCrStride,
+                             aData.mPicSize.width*4,
+                             type);                                                          
+  }
+  mSize = size;
 }
 
 static cairo_user_data_key_t imageSurfaceDataKey;
 
 static void
 DestroyBuffer(void* aBuffer)
 {
   delete[] static_cast<PRUint8*>(aBuffer);
@@ -213,29 +241,32 @@ BasicPlanarYCbCrImage::GetAsSurface()
 /**
  * Our image container is very simple. It's really just a factory
  * for the image objects. We use a Monitor to synchronize access to
  * mImage.
  */
 class BasicImageContainer : public ImageContainer {
 public:
   BasicImageContainer(BasicLayerManager* aManager) :
-    ImageContainer(aManager), mMonitor("BasicImageContainer")
+    ImageContainer(aManager), mMonitor("BasicImageContainer"),
+    mScaleHint(-1, -1)
   {}
   virtual already_AddRefed<Image> CreateImage(const Image::Format* aFormats,
                                               PRUint32 aNumFormats);
   virtual void SetCurrentImage(Image* aImage);
   virtual already_AddRefed<Image> GetCurrentImage();
   virtual already_AddRefed<gfxASurface> GetCurrentAsSurface(gfxIntSize* aSize);
   virtual gfxIntSize GetCurrentSize();
   virtual PRBool SetLayerManager(LayerManager *aManager);
+  virtual void SetScaleHint(const gfxIntSize& aScaleHint);
 
 protected:
   Monitor mMonitor;
   nsRefPtr<Image> mImage;
+  gfxIntSize mScaleHint;
 };
 
 /**
  * Returns true if aFormat is in the given format array.
  */
 static PRBool
 FormatInList(const Image::Format* aFormats, PRUint32 aNumFormats,
              Image::Format aFormat)
@@ -252,17 +283,18 @@ already_AddRefed<Image>
 BasicImageContainer::CreateImage(const Image::Format* aFormats,
                                  PRUint32 aNumFormats)
 {
   nsRefPtr<Image> image;
   // Prefer cairo surfaces because they're native for us
   if (FormatInList(aFormats, aNumFormats, Image::CAIRO_SURFACE)) {
     image = new BasicCairoImage();
   } else if (FormatInList(aFormats, aNumFormats, Image::PLANAR_YCBCR)) {
-    image = new BasicPlanarYCbCrImage();
+    MonitorAutoEnter mon(mMonitor);
+    image = new BasicPlanarYCbCrImage(mScaleHint);
   }
   return image.forget();
 }
 
 void
 BasicImageContainer::SetCurrentImage(Image* aImage)
 {
   MonitorAutoEnter mon(mMonitor);
@@ -298,16 +330,22 @@ BasicImageContainer::GetCurrentAsSurface
 
 gfxIntSize
 BasicImageContainer::GetCurrentSize()
 {
   MonitorAutoEnter mon(mMonitor);
   return !mImage ? gfxIntSize(0,0) : ToImageData(mImage)->GetSize();
 }
 
+void BasicImageContainer::SetScaleHint(const gfxIntSize& aScaleHint)
+{
+  MonitorAutoEnter mon(mMonitor);
+  mScaleHint = aScaleHint;
+}
+
 PRBool
 BasicImageContainer::SetLayerManager(LayerManager *aManager)
 {
   if (aManager &&
       aManager->GetBackendType() != LayerManager::LAYERS_BASIC)
   {
     return PR_FALSE;
   }
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -16,8 +16,9 @@ picture_region.patch: Change Chromium co
 
 remove_scale.patch: Removes Chromium scaling code.
 export.patch: Fix export for building on comm-central
 win64_mac64.patch: Fallback to C implementation on Windows and Mac OS X 64 bit
 yv24.patch: Adds YCbCr 4:4:4 support
 row_c_fix.patch: Fix broken C fallback code (See bug 561385).
 bug572034_mac_64bit.patch: Fix x86_64 linux code so it works on OS X.
 solaris.patch: Adds Solaris support, fallback to C implementation on SPARC
+add_scale.patch: Re-adds Chromium scaling code.
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/add_scale.patch
@@ -0,0 +1,953 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+index 40ce10f..7d46629 100644
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -82,10 +82,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
+ 
+ #ifdef ARCH_CPU_X86_FAMILY
+   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+   if (has_mmx)
+     EMMS();
+ #endif
+ }
+ 
++// Scale a frame of YUV to 32 bit ARGB.
++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int height,
++                                int scaled_width,
++                                int scaled_height,
++                                int y_pitch,
++                                int uv_pitch,
++                                int rgb_pitch,
++                                YUVType yuv_type,
++                                Rotate view_rotate) {
++  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
++  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
++  bool has_mmx = supports_mmx();
++  // Diagram showing origin and direction of source sampling.
++  // ->0   4<-
++  // 7       3
++  //
++  // 6       5
++  // ->1   2<-
++  // Rotations that start at right side of image.
++  if ((view_rotate == ROTATE_180) ||
++      (view_rotate == ROTATE_270) ||
++      (view_rotate == MIRROR_ROTATE_0) ||
++      (view_rotate == MIRROR_ROTATE_90)) {
++    y_buf += width - 1;
++    u_buf += width / 2 - 1;
++    v_buf += width / 2 - 1;
++    width = -width;
++  }
++  // Rotations that start at bottom of image.
++  if ((view_rotate == ROTATE_90) ||
++      (view_rotate == ROTATE_180) ||
++      (view_rotate == MIRROR_ROTATE_90) ||
++      (view_rotate == MIRROR_ROTATE_180)) {
++    y_buf += (height - 1) * y_pitch;
++    u_buf += ((height >> y_shift) - 1) * uv_pitch;
++    v_buf += ((height >> y_shift) - 1) * uv_pitch;
++    height = -height;
++  }
++
++  // Handle zero sized destination.
++  if (scaled_width == 0 || scaled_height == 0)
++    return;
++  int scaled_dx = width * 16 / scaled_width;
++  int scaled_dy = height * 16 / scaled_height;
++
++  int scaled_dx_uv = scaled_dx;
++
++  if ((view_rotate == ROTATE_90) ||
++      (view_rotate == ROTATE_270)) {
++    int tmp = scaled_height;
++    scaled_height = scaled_width;
++    scaled_width = tmp;
++    tmp = height;
++    height = width;
++    width = tmp;
++    int original_dx = scaled_dx;
++    int original_dy = scaled_dy;
++    scaled_dx = ((original_dy >> 4) * y_pitch) << 4;
++    scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4;
++    scaled_dy = original_dx;
++    if (view_rotate == ROTATE_90) {
++      y_pitch = -1;
++      uv_pitch = -1;
++      height = -height;
++    } else {
++      y_pitch = 1;
++      uv_pitch = 1;
++    }
++  }
++
++  for (int y = 0; y < scaled_height; ++y) {
++    uint8* dest_pixel = rgb_buf + y * rgb_pitch;
++    int scaled_y = (y * height / scaled_height);
++    const uint8* y_ptr = y_buf + scaled_y * y_pitch;
++    const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch;
++    const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch;
++
++#if defined(_MSC_VER) && defined(_M_IX86)
++    if (scaled_width == (width * 2)) {
++      DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                          dest_pixel, scaled_width);
++    } else if ((scaled_dx & 15) == 0) {  // Scaling by integer scale factor.
++      if (scaled_dx_uv == scaled_dx) {   // Not rotated.
++        if (scaled_dx == 16) {           // Not scaled
++          if (has_mmx)
++            FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                     dest_pixel, scaled_width);
++          else
++            FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                                      dest_pixel, scaled_width, x_shift);
++        } else {  // Simple scale down. ie half
++          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                               dest_pixel, scaled_width, scaled_dx >> 4);
++        }
++      } else {
++        RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                   dest_pixel, scaled_width,
++                                   scaled_dx >> 4, scaled_dx_uv >> 4);
++      }
++#else
++    if (scaled_dx == 16) {           // Not scaled
++      if (has_mmx)
++        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                 dest_pixel, scaled_width);
++      else
++        FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                                   dest_pixel, scaled_width, x_shift);
++#endif
++    } else {
++      if (has_mmx) 
++        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                           dest_pixel, scaled_width, scaled_dx);
++      else
++        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                             dest_pixel, scaled_width, scaled_dx, x_shift);
++
++    }  
++  }
++
++  // MMX used by the FastConvert/Scale row routines requires emms instruction.
++  if (has_mmx)
++    EMMS();
++}
++
+ }  // namespace gfx
+ }  // namespace mozilla
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+index c0b678d..a7e5b68 100644
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -15,27 +15,56 @@ namespace gfx {
+ // Type of YUV surface.
+ // The value of these enums matter as they are used to shift vertical indices.
+ enum YUVType {
+   YV12 = 0,           // YV12 is half width and half height chroma channels.
+   YV16 = 1,           // YV16 is half width and full height chroma channels.
+   YV24 = 2            // YV24 is full width and full height chroma channels.
+ };
+ 
++// Mirror means flip the image horizontally, as in looking in a mirror.
++// Rotate happens after mirroring.
++enum Rotate {
++  ROTATE_0,           // Rotation off.
++  ROTATE_90,          // Rotate clockwise.
++  ROTATE_180,         // Rotate upside down.
++  ROTATE_270,         // Rotate counter clockwise.
++  MIRROR_ROTATE_0,    // Mirror horizontally.
++  MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
++  MIRROR_ROTATE_180,  // Mirror vertically.
++  MIRROR_ROTATE_270   // Transpose.
++};
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
+                                   const uint8* uplane,
+                                   const uint8* vplane,
+                                   uint8* rgbframe,
+                                   int pic_x,
+                                   int pic_y,
+                                   int pic_width,
+                                   int pic_height,
+                                   int ystride,
+                                   int uvstride,
+                                   int rgbstride,
+                                   YUVType yuv_type);
+ 
++// Scale a frame of YUV to 32 bit ARGB.
++// Supports rotation and mirroring.
++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
++                                const uint8* uplane,
++                                const uint8* vplane,
++                                uint8* rgbframe,
++                                int frame_width,
++                                int frame_height,
++                                int scaled_width,
++                                int scaled_height,
++                                int ystride,
++                                int uvstride,
++                                int rgbstride,
++                                YUVType yuv_type,
++                                Rotate view_rotate);
++
+ }  // namespace gfx
+ }  // namespace mozilla
+ 
+ #endif  // MEDIA_BASE_YUV_CONVERT_H_
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+index 8519008..96969ec 100644
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 unsigned int x_shift);
+ 
+ 
++// Can do 1x, half size or any scale down by an integer amount.
++// Step can be negative (mirroring, rotate 180).
++// This is the third fastest of the scalers.
++void ConvertYUVToRGB32Row(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int step);
++
++// Rotate is like Convert, but applies different step to Y versus U and V.
++// This allows rotation by 90 or 270, by stepping by stride.
++// This is the fourth fastest of the scalers.
++void RotateConvertYUVToRGB32Row(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int ystep,
++                                int uvstep);
++
++// Doubler does 4 pixels at a time.  Each pixel is replicated.
++// This is the fastest of the scalers.
++void DoubleYUVToRGB32Row(const uint8* y_buf,
++                         const uint8* u_buf,
++                         const uint8* v_buf,
++                         uint8* rgb_buf,
++                         int width);
++
++// Handles arbitrary scaling up or down.
++// Mirroring is supported, but not 90 or 270 degree rotation.
++// Chroma is undersampled every 2 pixels for performance.
++// This is the slowest of the scalers.
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx);
++
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int scaled_dx,
++                          unsigned int x_shift);
++
+ }  // extern "C"
+ 
+ // x64 uses MMX2 (SSE) so emms is not required.
+ #if defined(ARCH_CPU_X86)
+ #if defined(_MSC_VER)
+ #define EMMS() __asm emms
+ #else
+ #define EMMS() asm("emms")
+diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
+index b5c0018..49eced2 100644
+--- a/gfx/ycbcr/yuv_row_c.cpp
++++ b/gfx/ycbcr/yuv_row_c.cpp
+@@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+         v = v_buf[x + 1];
+       }
+       YuvPixel(y1, u, v, rgb_buf + 4);
+     }
+     rgb_buf += 8;  // Advance 2 pixels.
+   }
+ }
+ 
++// 28.4 fixed point is used.  A shift by 4 isolates the integer.
++// A shift by 5 is used to further subsample the chrominance channels.
++// & 15 isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
++// for 1/4 pixel accurate interpolation.
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx,
++                        unsigned int x_shift) {
++  int scaled_x = 0;
++  for (int x = 0; x < width; ++x) {
++    uint8 u = u_buf[scaled_x >> (4 + x_shift)];
++    uint8 v = v_buf[scaled_x >> (4 + x_shift)];
++    uint8 y0 = y_buf[scaled_x >> 4];
++    YuvPixel(y0, u, v, rgb_buf);
++    rgb_buf += 4;
++    scaled_x += scaled_dx;
++  }
++}
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_linux.cpp b/gfx/ycbcr/yuv_row_linux.cpp
+index 9f7625c..bff02b3 100644
+--- a/gfx/ycbcr/yuv_row_linux.cpp
++++ b/gfx/ycbcr/yuv_row_linux.cpp
+@@ -16,16 +16,24 @@ extern "C" {
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+  
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
++}
+ #else
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+ }
+@@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
+     "r"(u_buf),  // %1
+     "r"(v_buf),  // %2
+     "r"(rgb_buf),  // %3
+     "r"(width),  // %4
+     "r" (kCoefficientsRgbY)  // %5
+   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+ }
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
++                        const uint8* u_buf,  // rsi
++                        const uint8* v_buf,  // rdx
++                        uint8* rgb_buf,      // rcx
++                        int width,           // r8
++                        int scaled_dx) {     // r9
++  asm(
++  "xor    %%r11,%%r11\n"
++  "sub    $0x2,%4\n"
++  "js     scalenext\n"
++
++"scaleloop:"
++  "mov    %%r11,%%r10\n"
++  "sar    $0x5,%%r10\n"
++  "movzb  (%1,%%r10,1),%%rax\n"
++  "movq   2048(%5,%%rax,8),%%xmm0\n"
++  "movzb  (%2,%%r10,1),%%rax\n"
++  "movq   4096(%5,%%rax,8),%%xmm1\n"
++  "lea    (%%r11,%6),%%r10\n"
++  "sar    $0x4,%%r11\n"
++  "movzb  (%0,%%r11,1),%%rax\n"
++  "paddsw %%xmm1,%%xmm0\n"
++  "movq   (%5,%%rax,8),%%xmm1\n"
++  "lea    (%%r10,%6),%%r11\n"
++  "sar    $0x4,%%r10\n"
++  "movzb  (%0,%%r10,1),%%rax\n"
++  "movq   (%5,%%rax,8),%%xmm2\n"
++  "paddsw %%xmm0,%%xmm1\n"
++  "paddsw %%xmm0,%%xmm2\n"
++  "shufps $0x44,%%xmm2,%%xmm1\n"
++  "psraw  $0x6,%%xmm1\n"
++  "packuswb %%xmm1,%%xmm1\n"
++  "movq   %%xmm1,0x0(%3)\n"
++  "add    $0x8,%3\n"
++  "sub    $0x2,%4\n"
++  "jns    scaleloop\n"
++
++"scalenext:"
++  "add    $0x1,%4\n"
++  "js     scaledone\n"
++
++  "mov    %%r11,%%r10\n"
++  "sar    $0x5,%%r10\n"
++  "movzb  (%1,%%r10,1),%%rax\n"
++  "movq   2048(%5,%%rax,8),%%xmm0\n"
++  "movzb  (%2,%%r10,1),%%rax\n"
++  "movq   4096(%5,%%rax,8),%%xmm1\n"
++  "paddsw %%xmm1,%%xmm0\n"
++  "sar    $0x4,%%r11\n"
++  "movzb  (%0,%%r11,1),%%rax\n"
++  "movq   (%5,%%rax,8),%%xmm1\n"
++  "paddsw %%xmm0,%%xmm1\n"
++  "psraw  $0x6,%%xmm1\n"
++  "packuswb %%xmm1,%%xmm1\n"
++  "movd   %%xmm1,0x0(%3)\n"
++
++"scaledone:"
++  :
++  : "r"(y_buf),  // %0
++    "r"(u_buf),  // %1
++    "r"(v_buf),  // %2
++    "r"(rgb_buf),  // %3
++    "r"(width),  // %4
++    "r" (kCoefficientsRgbY),  // %5
++    "r"(static_cast<long>(scaled_dx))  // %6
++  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
++);
++}
++
+ #endif // __SUNPRO_CC
+ 
+ #else // ARCH_CPU_X86_64
+ 
+ #ifdef __SUNPRO_CC
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+@@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+   "packuswb %mm1,%mm1\n"
+   "movd   %mm1,0x0(%ebp)\n"
+ "2:"
+   "popa\n"
+   "ret\n"
+   ".previous\n"
+ );
+ 
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx);
++
++  asm(
++  ".global ScaleYUVToRGB32Row\n"
++"ScaleYUVToRGB32Row:\n"
++  "pusha\n"
++  "mov    0x24(%esp),%edx\n"
++  "mov    0x28(%esp),%edi\n"
++  "mov    0x2c(%esp),%esi\n"
++  "mov    0x30(%esp),%ebp\n"
++  "mov    0x34(%esp),%ecx\n"
++  "xor    %ebx,%ebx\n"
++  "jmp    scaleend\n"
++
++"scaleloop:"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
++  "paddsw %mm0,%mm1\n"
++  "paddsw %mm0,%mm2\n"
++  "psraw  $0x6,%mm1\n"
++  "psraw  $0x6,%mm2\n"
++  "packuswb %mm2,%mm1\n"
++  "movntq %mm1,0x0(%ebp)\n"
++  "add    $0x8,%ebp\n"
++"scaleend:"
++  "sub    $0x2,%ecx\n"
++  "jns    scaleloop\n"
++
++  "and    $0x1,%ecx\n"
++  "je     scaledone\n"
++
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
++  "paddsw %mm0,%mm1\n"
++  "psraw  $0x6,%mm1\n"
++  "packuswb %mm1,%mm1\n"
++  "movd   %mm1,0x0(%ebp)\n"
++
++"scaledone:"
++  "popa\n"
++  "ret\n"
++);
++
+ #endif // __SUNPRO_CC
+ #endif // ARCH_CPU_X86_64
+ #endif // !ARCH_CPU_X86_FAMILY
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_mac.cpp b/gfx/ycbcr/yuv_row_mac.cpp
+index a1d0058..5acf825 100644
+--- a/gfx/ycbcr/yuv_row_mac.cpp
++++ b/gfx/ycbcr/yuv_row_mac.cpp
+@@ -16,16 +16,24 @@ extern "C" {
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+  
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
++}
+ #else
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+ }
+@@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+                           &kCoefficientsRgbY[0][0]);
+ }
+ 
++extern void MacScaleYUVToRGB32Row(const uint8* y_buf,
++                               const uint8* u_buf,
++                               const uint8* v_buf,
++                               uint8* rgb_buf,
++                               int width,
++                               int scaled_dx,
++                               int16 *kCoefficientsRgbY);
++
++  __asm__(
++"_MacScaleYUVToRGB32Row:\n"
++  "pusha\n"
++  "mov    0x24(%esp),%edx\n"
++  "mov    0x28(%esp),%edi\n"
++  "mov    0x2c(%esp),%esi\n"
++  "mov    0x30(%esp),%ebp\n"
++  "mov    0x3c(%esp),%ecx\n"
++  "xor    %ebx,%ebx\n"
++  "jmp    Lscaleend\n"
++
++"Lscaleloop:"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   2048(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw 4096(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm1\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm2\n"
++  "paddsw %mm0,%mm1\n"
++  "paddsw %mm0,%mm2\n"
++  "psraw  $0x6,%mm1\n"
++  "psraw  $0x6,%mm2\n"
++  "packuswb %mm2,%mm1\n"
++  "movntq %mm1,0x0(%ebp)\n"
++  "add    $0x8,%ebp\n"
++"Lscaleend:"
++  "sub    $0x2,0x34(%esp)\n"
++  "jns    Lscaleloop\n"
++
++  "and    $0x1,0x34(%esp)\n"
++  "je     Lscaledone\n"
++
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   2048(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw 4096(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm1\n"
++  "paddsw %mm0,%mm1\n"
++  "psraw  $0x6,%mm1\n"
++  "packuswb %mm1,%mm1\n"
++  "movd   %mm1,0x0(%ebp)\n"
++
++"Lscaledone:"
++  "popa\n"
++  "ret\n"
++);
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {  // thin wrapper around the assembly routine
++
++  MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx,
++                        &kCoefficientsRgbY[0][0]);  // table passed as an extra argument
++}
++
+ #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
+index 699ac77..a1700fc 100644
+--- a/gfx/ycbcr/yuv_row_win.cpp
++++ b/gfx/ycbcr/yuv_row_win.cpp
+@@ -11,17 +11,26 @@ extern "C" {
+ // PPC implementation uses C fallback
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+- 
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {  // no assembly on this branch: use the C row
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);  // x_shift = 1
++}
++
+ #else
+ 
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+@@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+     movd      [ebp], mm1
+  convertdone :
+ 
+     popad
+     ret
+   }
+ }
+ 
++__declspec(naked)  // no compiler prologue/epilogue: arguments are read relative to esp
++void ConvertYUVToRGB32Row(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int step) {
++  __asm {
++    pushad                          // pushes 32 bytes, hence the "esp + 32" below
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    mov       ebx, [esp + 32 + 24]  // step
++    jmp       wend
++
++ wloop :                            // two output pixels per iteration
++    movzx     eax, byte ptr [edi]   // U sample
++    add       edi, ebx              // U advances by step once per pixel pair
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]   // V sample
++    add       esi, ebx              // V advances by step once per pixel pair
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U term + V term
++    movzx     eax, byte ptr [edx]   // first Y sample
++    add       edx, ebx              // Y advances by step for every pixel
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    movzx     eax, byte ptr [edx]   // second Y sample
++    add       edx, ebx
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0              // both pixels share the chroma term
++    paddsw    mm2, mm0
++    psraw     mm1, 6                // drop the 6 fractional bits
++    psraw     mm2, 6
++    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels
++    movntq    [ebp], mm1            // non-temporal 8-byte store
++    add       ebp, 8
++ wend :
++    sub       ecx, 2
++    jns       wloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]   // trailing single pixel
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1            // store one 4-byte pixel
++ wdone :
++
++    popad
++    ret
++  }
++}
++
++__declspec(naked)  // no compiler prologue/epilogue: arguments are read relative to esp
++void RotateConvertYUVToRGB32Row(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int ystep,
++                                int uvstep) {
++  __asm {
++    pushad                          // pushes 32 bytes, hence the "esp + 32" below
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    jmp       wend
++
++ wloop :                            // two output pixels per iteration
++    movzx     eax, byte ptr [edi]   // U sample
++    mov       ebx, [esp + 32 + 28]  // uvstep
++    add       edi, ebx              // U advances by uvstep once per pixel pair
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]   // V sample
++    add       esi, ebx              // V advances by uvstep once per pixel pair
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U term + V term
++    movzx     eax, byte ptr [edx]   // first Y sample
++    mov       ebx, [esp + 32 + 24]  // ystep
++    add       edx, ebx              // Y advances by ystep for every pixel
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    movzx     eax, byte ptr [edx]   // second Y sample
++    add       edx, ebx
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0              // both pixels share the chroma term
++    paddsw    mm2, mm0
++    psraw     mm1, 6                // drop the 6 fractional bits
++    psraw     mm2, 6
++    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels
++    movntq    [ebp], mm1            // non-temporal 8-byte store
++    add       ebp, 8
++ wend :
++    sub       ecx, 2
++    jns       wloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]   // trailing single pixel
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1            // store one 4-byte pixel
++ wdone :
++
++    popad
++    ret
++  }
++}
++
++__declspec(naked)  // no compiler prologue/epilogue: arguments are read relative to esp
++void DoubleYUVToRGB32Row(const uint8* y_buf,
++                         const uint8* u_buf,
++                         const uint8* v_buf,
++                         uint8* rgb_buf,
++                         int width) {
++  __asm {
++    pushad                          // pushes 32 bytes, hence the "esp + 32" below
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width (counts output pixels, 4 per iteration)
++    jmp       wend
++
++ wloop :                            // 1 U, 1 V and 2 Y samples -> 4 output pixels
++    movzx     eax, byte ptr [edi]   // U sample
++    add       edi, 1
++    movzx     ebx, byte ptr [esi]   // V sample
++    add       esi, 1
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [edx]   // first Y sample
++    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U term + V term
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6                // drop the 6 fractional bits
++    packuswb  mm1, mm1
++    punpckldq mm1, mm1              // duplicate the pixel into both halves
++    movntq    [ebp], mm1            // first Y sample written twice
++
++    movzx     ebx, byte ptr [edx + 1]  // second Y sample
++    add       edx, 2
++    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // reuse mm0: chroma term + second Y
++    psraw     mm0, 6
++    packuswb  mm0, mm0
++    punpckldq mm0, mm0              // duplicate the pixel into both halves
++    movntq    [ebp+8], mm0          // second Y sample written twice
++    add       ebp, 16
++ wend :
++    sub       ecx, 4
++    jns       wloop
++
++    add       ecx, 4                // ecx = 0..3 output pixels still to write
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]   // tail: convert one pixel from the current samples
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    jmp       wend1
++
++ wloop1 :                           // the same pixel is stored for every remaining output
++    movd      [ebp], mm1
++    add       ebp, 4
++ wend1 :
++    sub       ecx, 1
++    jns       wloop1
++ wdone :
++    popad
++    ret
++  }
++}
++
++// This version does general purpose scaling by any amount, up or down.
++// The only thing it cannot do is rotation by 90 or 270.
++// For performance the chroma is under sampled, reducing cost of a 3x
++// 1080p scale from 8.4 ms to 5.4 ms.
++__declspec(naked)  // no compiler prologue/epilogue: arguments are read relative to esp
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int dx) {
++  __asm {
++    pushad                          // pushes 32 bytes, hence the "esp + 32" below
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    xor       ebx, ebx              // x (source position with 4 fractional bits)
++    jmp       scaleend
++
++ scaleloop :                        // two output pixels per iteration
++    mov       eax, ebx
++    sar       eax, 5                // chroma index = x >> 5
++    movzx     eax, byte ptr [edi + eax]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [esi + eax]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U term + V term
++    mov       eax, ebx
++    add       ebx, [esp + 32 + 24]  // x += dx
++    sar       eax, 4                // luma index = x >> 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    mov       eax, ebx
++    add       ebx, [esp + 32 + 24]  // x += dx
++    sar       eax, 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0              // both pixels share the chroma term
++    paddsw    mm2, mm0
++    psraw     mm1, 6                // drop the 6 fractional bits
++    psraw     mm2, 6
++    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels
++    movntq    [ebp], mm1            // non-temporal 8-byte store
++    add       ebp, 8
++ scaleend :
++    sub       ecx, 2
++    jns       scaleloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        scaledone
++
++    mov       eax, ebx              // trailing single pixel
++    sar       eax, 5
++    movzx     eax, byte ptr [edi + eax]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [esi + eax]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1            // store one 4-byte pixel
++
++ scaledone :
++    popad
++    ret
++  }
++}
++
+ #endif // ARCH_CPU_64_BITS
+ }  // extern "C"
+ 
--- a/gfx/ycbcr/update.sh
+++ b/gfx/ycbcr/update.sh
@@ -10,8 +10,9 @@ patch -p3 <convert.patch
 patch -p3 <picture_region.patch
 patch -p3 <remove_scale.patch
 patch -p3 <export.patch
 patch -p3 <win64_mac64.patch
 patch -p3 <yv24.patch
 patch -p3 <row_c_fix.patch
 patch -p3 <bug572034_mac_64bit.patch
 patch -p3 <bug577645_movntq.patch
+patch -p3 <add_scale.patch
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -84,10 +84,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const 
 
 #ifdef ARCH_CPU_X86_FAMILY
   // SSE used for FastConvertYUVToRGB32Row requires emms instruction.
   if (has_sse)
     EMMS();
 #endif
 }
 
+// Scale a frame of YUV to 32 bit ARGB, with optional mirroring/rotation.
+NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,    // source Y plane
+                                const uint8* u_buf,    // source U plane
+                                const uint8* v_buf,    // source V plane
+                                uint8* rgb_buf,        // destination pixels
+                                int width,             // source width
+                                int height,            // source height
+                                int scaled_width,      // destination width
+                                int scaled_height,     // destination height
+                                int y_pitch,           // bytes per Y row
+                                int uv_pitch,          // bytes per U/V row
+                                int rgb_pitch,         // bytes per output row
+                                YUVType yuv_type,      // chroma subsampling
+                                Rotate view_rotate) {  // mirror/rotate mode
+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
+  // The 32-bit assembly rows store with movntq, which needs SSE (not just
+  // MMX), and they hard-code half-width chroma, so YV24 must use the C rows.
+  bool has_sse = supports_mmx() && supports_sse() && yuv_type != YV24;
+  // Diagram showing origin and direction of source sampling.
+  // ->0   4<-
+  // 7       3
+  //
+  // 6       5
+  // ->1   2<-
+  // Rotations that start at right side of image.
+  if ((view_rotate == ROTATE_180) ||
+      (view_rotate == ROTATE_270) ||
+      (view_rotate == MIRROR_ROTATE_0) ||
+      (view_rotate == MIRROR_ROTATE_90)) {
+    y_buf += width - 1;
+    u_buf += width / 2 - 1;
+    v_buf += width / 2 - 1;
+    width = -width;
+  }
+  // Rotations that start at bottom of image.
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_180) ||
+      (view_rotate == MIRROR_ROTATE_90) ||
+      (view_rotate == MIRROR_ROTATE_180)) {
+    y_buf += (height - 1) * y_pitch;
+    u_buf += ((height >> y_shift) - 1) * uv_pitch;
+    v_buf += ((height >> y_shift) - 1) * uv_pitch;
+    height = -height;
+  }
+
+  // Handle zero sized destination.
+  if (scaled_width == 0 || scaled_height == 0)
+    return;
+  int scaled_dx = width * 16 / scaled_width;    // source step, 4 fractional bits
+  int scaled_dy = height * 16 / scaled_height;
+  int scaled_dx_uv = scaled_dx;
+
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_270)) {
+    int tmp = scaled_height;
+    scaled_height = scaled_width;
+    scaled_width = tmp;
+    tmp = height;
+    height = width;
+    width = tmp;
+    int original_dx = scaled_dx;
+    int original_dy = scaled_dy;
+    scaled_dx = ((original_dy >> 4) * y_pitch) << 4;
+    scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4;
+    scaled_dy = original_dx;
+    if (view_rotate == ROTATE_90) {
+      y_pitch = -1;
+      uv_pitch = -1;
+      height = -height;
+    } else {
+      y_pitch = 1;
+      uv_pitch = 1;
+    }
+  }
+
+  for (int y = 0; y < scaled_height; ++y) {
+    uint8* dest_pixel = rgb_buf + y * rgb_pitch;
+    int scaled_y = (y * height / scaled_height);
+    const uint8* y_ptr = y_buf + scaled_y * y_pitch;
+    const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch;
+    const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch;
+
+#if defined(_MSC_VER) && defined(_M_IX86)
+    if (has_sse && scaled_width == (width * 2)) {
+      DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                          dest_pixel, scaled_width);
+    } else if (has_sse && (scaled_dx & 15) == 0) {  // Integer scale factor.
+      if (scaled_dx_uv == scaled_dx) {   // Not rotated.
+        if (scaled_dx == 16) {           // Not scaled
+          if (has_sse)
+            FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                     dest_pixel, scaled_width);
+          else
+            FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                                      dest_pixel, scaled_width, x_shift);
+        } else {  // Simple scale down. ie half
+          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                               dest_pixel, scaled_width, scaled_dx >> 4);
+        }
+      } else {
+        RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, scaled_width,
+                                   scaled_dx >> 4, scaled_dx_uv >> 4);
+      }
+#else
+    if (scaled_dx == 16) {           // Not scaled
+      if (has_sse)
+        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                 dest_pixel, scaled_width);
+      else
+        FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, scaled_width, x_shift);
+#endif
+    } else {  // General scaling; also the C fallback for the MSVC fast paths.
+      if (has_sse)
+        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                           dest_pixel, scaled_width, scaled_dx);
+      else
+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, scaled_width, scaled_dx, x_shift);
+    }
+  }
+
+  // The MMX row functions require an emms instruction once they are done.
+  if (has_sse)
+    EMMS();
+}
+
 }  // namespace gfx
 }  // namespace mozilla
--- a/gfx/ycbcr/yuv_convert.h
+++ b/gfx/ycbcr/yuv_convert.h
@@ -15,27 +15,56 @@ namespace gfx {
 // Type of YUV surface.
 // The value of these enums matter as they are used to shift vertical indices.
 enum YUVType {
   YV12 = 0,           // YV12 is half width and half height chroma channels.
   YV16 = 1,           // YV16 is half width and full height chroma channels.
   YV24 = 2            // YV24 is full width and full height chroma channels.
 };
 
+// Mirror means flip the image horizontally, as in looking in a mirror.
+// Rotate happens after mirroring.
+enum Rotate {
+  ROTATE_0,           // Rotation off.
+  ROTATE_90,          // Rotate 90 degrees clockwise.
+  ROTATE_180,         // Rotate 180 degrees (upside down).
+  ROTATE_270,         // Rotate 270 degrees clockwise (i.e. counter clockwise).
+  MIRROR_ROTATE_0,    // Mirror horizontally, no rotation.
+  MIRROR_ROTATE_90,   // Mirror, then rotate 90 degrees clockwise.
+  MIRROR_ROTATE_180,  // Mirror, then rotate 180 degrees: a vertical flip.
+  MIRROR_ROTATE_270   // Mirror, then rotate 270 degrees clockwise: a transpose.
+};
+
 // Convert a frame of YUV to 32 bit ARGB.
 // Pass in YV16/YV12 depending on source format
 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
                                   const uint8* uplane,
                                   const uint8* vplane,
                                   uint8* rgbframe,
                                   int pic_x,
                                   int pic_y,
                                   int pic_width,
                                   int pic_height,
                                   int ystride,
                                   int uvstride,
                                   int rgbstride,
                                   YUVType yuv_type);
 
+// Scale a frame of YUV to 32 bit ARGB.
+// Supports rotation and mirroring.
+NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
+                                const uint8* uplane,
+                                const uint8* vplane,
+                                uint8* rgbframe,
+                                int frame_width,
+                                int frame_height,
+                                int scaled_width,
+                                int scaled_height,
+                                int ystride,
+                                int uvstride,
+                                int rgbstride,
+                                YUVType yuv_type,
+                                Rotate view_rotate);
+
 }  // namespace gfx
 }  // namespace mozilla
 
 #endif  // MEDIA_BASE_YUV_CONVERT_H_
--- a/gfx/ycbcr/yuv_row.h
+++ b/gfx/ycbcr/yuv_row.h
@@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint
 void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width,
                                 unsigned int x_shift);
 
 
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+void ConvertYUVToRGB32Row(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int step);
+
+// Rotate is like Convert, but applies different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int ystep,
+                                int uvstep);
+
+// Doubler does 4 pixels at a time.  Each pixel is replicated.
+// This is the fastest of the scalers.
+void DoubleYUVToRGB32Row(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width);
+
+// Handles arbitrary scaling up or down.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+// This is the slowest of the scalers.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int scaled_dx,
+                          unsigned int x_shift);
+
 }  // extern "C"
 
 // x64 uses MMX2 (SSE) so emms is not required.
 #if defined(ARCH_CPU_X86)
 #if defined(_MSC_VER)
 #define EMMS() __asm emms
 #else
 #define EMMS() asm("emms")
--- a/gfx/ycbcr/yuv_row_c.cpp
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const ui
         v = v_buf[x + 1];
       }
       YuvPixel(y1, u, v, rgb_buf + 4);
     }
     rgb_buf += 8;  // Advance 2 pixels.
   }
 }
 
+// Point-samples one row of YUV into 32 bit pixels at an arbitrary scale.
+// The source position is kept in 28.4 fixed point, so a shift by 4 yields
+// the luma column; chroma is shifted by a further x_shift for subsampling.
+// No interpolation is done: the fractional bits are simply discarded.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int scaled_dx,
+                          unsigned int x_shift) {
+  const unsigned int chroma_shift = 4 + x_shift;
+  for (int i = 0, pos = 0; i < width; ++i, pos += scaled_dx) {
+    const int chroma_col = pos >> chroma_shift;
+    const uint8 luma = y_buf[pos >> 4];
+    const uint8 cb = u_buf[chroma_col];
+    const uint8 cr = v_buf[chroma_col];
+    YuvPixel(luma, cb, cr, rgb_buf);
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_linux.cpp
+++ b/gfx/ycbcr/yuv_row_linux.cpp
@@ -16,16 +16,24 @@ extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
  
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {  // no assembly on this branch: use the C row
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);  // x_shift = 1
+}
 #else
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
 }
@@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint
     "r"(u_buf),  // %1
     "r"(v_buf),  // %2
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY)  // %5
   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
 );
 }
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
+                        const uint8* u_buf,  // rsi
+                        const uint8* v_buf,  // rdx
+                        uint8* rgb_buf,      // rcx
+                        int width,           // r8
+                        int scaled_dx) {     // r9
+  asm(
+  "xor    %%r11,%%r11\n"              // r11 = x, source position with 4 fractional bits
+  "sub    $0x2,%4\n"
+  "js     scalenext\n"                // fewer than two pixels: skip the pair loop
+
+"scaleloop:"                          // two output pixels per iteration
+  "mov    %%r11,%%r10\n"
+  "sar    $0x5,%%r10\n"               // r10 = chroma index (x >> 5)
+  "movzb  (%1,%%r10,1),%%rax\n"       // rax = U sample
+  "movq   2048(%5,%%rax,8),%%xmm0\n"  // xmm0 = table entry at +2048 for U
+  "movzb  (%2,%%r10,1),%%rax\n"       // rax = V sample
+  "movq   4096(%5,%%rax,8),%%xmm1\n"  // xmm1 = table entry at +4096 for V
+  "lea    (%%r11,%6),%%r10\n"         // r10 = x + scaled_dx (position of second pixel)
+  "sar    $0x4,%%r11\n"               // r11 = luma index of first pixel (x >> 4)
+  "movzb  (%0,%%r11,1),%%rax\n"       // rax = first Y sample
+  "paddsw %%xmm1,%%xmm0\n"            // xmm0 = U term + V term
+  "movq   (%5,%%rax,8),%%xmm1\n"      // xmm1 = table entry for first Y
+  "lea    (%%r10,%6),%%r11\n"         // r11 = x + 2 * scaled_dx (next iteration's x)
+  "sar    $0x4,%%r10\n"               // r10 = luma index of second pixel
+  "movzb  (%0,%%r10,1),%%rax\n"       // rax = second Y sample
+  "movq   (%5,%%rax,8),%%xmm2\n"      // xmm2 = table entry for second Y
+  "paddsw %%xmm0,%%xmm1\n"            // both pixels share the chroma term
+  "paddsw %%xmm0,%%xmm2\n"
+  "shufps $0x44,%%xmm2,%%xmm1\n"      // xmm1 = low half of xmm1, low half of xmm2
+  "psraw  $0x6,%%xmm1\n"              // drop the 6 fractional bits
+  "packuswb %%xmm1,%%xmm1\n"          // saturate to bytes: two 4-byte pixels
+  "movq   %%xmm1,0x0(%3)\n"           // store 8 bytes
+  "add    $0x8,%3\n"
+  "sub    $0x2,%4\n"
+  "jns    scaleloop\n"
+
+"scalenext:"
+  "add    $0x1,%4\n"                  // counter is -1 here when one odd pixel remains
+  "js     scaledone\n"
+
+  "mov    %%r11,%%r10\n"              // trailing single pixel
+  "sar    $0x5,%%r10\n"
+  "movzb  (%1,%%r10,1),%%rax\n"
+  "movq   2048(%5,%%rax,8),%%xmm0\n"
+  "movzb  (%2,%%r10,1),%%rax\n"
+  "movq   4096(%5,%%rax,8),%%xmm1\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "sar    $0x4,%%r11\n"
+  "movzb  (%0,%%r11,1),%%rax\n"
+  "movq   (%5,%%rax,8),%%xmm1\n"
+  "paddsw %%xmm0,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movd   %%xmm1,0x0(%3)\n"           // store one 4-byte pixel
+
+"scaledone:"
+  :
+  : "r"(y_buf),  // %0
+    "r"(u_buf),  // %1
+    "r"(v_buf),  // %2
+    "r"(rgb_buf),  // %3  NOTE(review): declared input-only but advanced by the asm
+    "r"(width),  // %4  NOTE(review): declared input-only but decremented by the asm
+    "r" (kCoefficientsRgbY),  // %5
+    "r"(static_cast<long>(scaled_dx))  // %6
+  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
 #endif // __SUNPRO_CC
 
 #else // ARCH_CPU_X86_64
 
 #ifdef __SUNPRO_CC
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
@@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
 "2:"
   "popa\n"
   "ret\n"
   ".previous\n"
 );
 
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx);  // defined by the top-level assembly below
+
+  asm(
+  ".text\n"  ".global ScaleYUVToRGB32Row\n"  // select the code section explicitly
+"ScaleYUVToRGB32Row:\n"
+  "pusha\n"                          // pushes 32 bytes: first argument is at 0x24(%esp)
+  "mov    0x24(%esp),%edx\n"         // edx = y_buf
+  "mov    0x28(%esp),%edi\n"         // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"         // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"         // ebp = rgb_buf
+  "mov    0x34(%esp),%ecx\n"         // ecx = width
+  "xor    %ebx,%ebx\n"               // ebx = x, source position with 4 fractional bits
+  "jmp    scaleend\n"
+
+"scaleloop:"                         // two output pixels per iteration
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"               // chroma index = x >> 5
+  "movzbl (%edi,%eax,1),%eax\n"      // eax = U sample
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"   // table entry at +2048 for U
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"      // eax = V sample
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"   // plus table entry at +4096 for V
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"         // x += scaled_dx
+  "sar    $0x4,%eax\n"               // luma index = x >> 4
+  "movzbl (%edx,%eax,1),%eax\n"      // eax = first Y sample
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"         // x += scaled_dx
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"      // eax = second Y sample
+  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"               // both pixels share the chroma term
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"               // drop the 6 fractional bits
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"             // saturate to bytes: two 4-byte pixels
+  "movntq %mm1,0x0(%ebp)\n"          // non-temporal 8-byte store (SSE instruction)
+  "add    $0x8,%ebp\n"
+"scaleend:"
+  "sub    $0x2,%ecx\n"
+  "jns    scaleloop\n"
+
+  "and    $0x1,%ecx\n"               // counter is -1 when one odd pixel remains
+  "je     scaledone\n"
+
+  "mov    %ebx,%eax\n"               // trailing single pixel
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"          // store one 4-byte pixel
+
+"scaledone:"
+  "popa\n"
+  "ret\n"  ".previous\n"             // restore the section that was current before .text
+);
+
 #endif // __SUNPRO_CC
 #endif // ARCH_CPU_X86_64
 #endif // !ARCH_CPU_X86_FAMILY
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_mac.cpp
+++ b/gfx/ycbcr/yuv_row_mac.cpp
@@ -16,16 +16,24 @@ extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
  
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {  // no assembly on this branch: use the C row
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);  // x_shift = 1
+}
 #else
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
 }
@@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
                           &kCoefficientsRgbY[0][0]);
 }
 
+extern void MacScaleYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width,
+                               int scaled_dx,
+                               int16 *kCoefficientsRgbY);
+
+  __asm__(
+"_MacScaleYUVToRGB32Row:\n"
+  "pusha\n"                          // pushes 32 bytes: first argument is at 0x24(%esp)
+  "mov    0x24(%esp),%edx\n"         // edx = y_buf
+  "mov    0x28(%esp),%edi\n"         // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"         // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"         // ebp = rgb_buf
+  "mov    0x3c(%esp),%ecx\n"         // ecx = coefficient table pointer (7th argument)
+  "xor    %ebx,%ebx\n"               // ebx = x, source position with 4 fractional bits
+  "jmp    Lscaleend\n"
+
+"Lscaleloop:"                        // converts two output pixels per iteration
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"               // chroma index = x >> 5
+  "movzbl (%edi,%eax,1),%eax\n"      // eax = U sample
+  "movq   2048(%ecx,%eax,8),%mm0\n"  // mm0 = table entry at +2048 for U
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"      // eax = V sample
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"  // mm0 += table entry at +4096 for V
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"         // x += scaled_dx
+  "sar    $0x4,%eax\n"               // luma index = x >> 4
+  "movzbl (%edx,%eax,1),%eax\n"      // eax = first Y sample
+  "movq   0(%ecx,%eax,8),%mm1\n"     // mm1 = table entry for first Y
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"         // x += scaled_dx
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"      // eax = second Y sample
+  "movq   0(%ecx,%eax,8),%mm2\n"     // mm2 = table entry for second Y
+  "paddsw %mm0,%mm1\n"               // add the shared chroma term to both pixels
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"               // drop the 6 fractional bits of the coefficients
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"             // saturate to bytes: two 4-byte pixels
+  "movntq %mm1,0x0(%ebp)\n"          // non-temporal 8-byte store (SSE instruction)
+  "add    $0x8,%ebp\n"
+"Lscaleend:"
+  "subl   $0x2,0x34(%esp)\n"         // width -= 2; explicit 'l' as no register fixes the size
+  "jns    Lscaleloop\n"
+
+  "andl   $0x1,0x34(%esp)\n"         // counter is -1 when one odd pixel remains
+  "je     Lscaledone\n"
+
+  "mov    %ebx,%eax\n"               // trailing single pixel, same steps as above
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"          // store one 4-byte pixel
+
+"Lscaledone:"
+  "popa\n"
+  "ret\n"
+);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {  // thin wrapper around the assembly routine
+
+  MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx,
+                        &kCoefficientsRgbY[0][0]);  // table passed as an extra argument
+}
+
 #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_win.cpp
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -11,17 +11,26 @@ extern "C" {
 // PPC implementation uses C fallback
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
- 
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {  // no assembly on this branch: use the C row
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);  // x_shift = 1
+}
+
 #else
 
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
@@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint
     movd      [ebp], mm1
  convertdone :
 
     popad
     ret
   }
 }
 
+__declspec(naked)  // naked: no compiler prologue/epilogue; the asm returns itself
+void ConvertYUVToRGB32Row(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int step) {
+  __asm {
+    pushad                          // saves 8 registers, hence the +32 below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    mov       ebx, [esp + 32 + 24]  // step
+    jmp       wend                  // check the count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // eax = U sample
+    add       edi, ebx              // U pointer += step
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // 8-byte table entry
+    movzx     eax, byte ptr [esi]   // eax = V sample
+    add       esi, ebx              // V pointer += step
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
+    movzx     eax, byte ptr [edx]   // eax = first Y sample
+    add       edx, ebx              // Y pointer += step
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]   // eax = second Y sample
+    add       edx, ebx              // Y pointer += step
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels reuse the same U + V terms
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // RGBY() scales entries by 64, so >> 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: mm1 low, mm2 high
+    movntq    [ebp], mm1            // non-temporal store of 8 bytes (2 pixels)
+    add       ebp, 8
+ wend :
+    sub       ecx, 2                // two pixels per pass
+    jns       wloop                 // loop while the count stays >= 0
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // last pixel: same math on one Y sample
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // store 4 bytes (1 pixel)
+ wdone :
+
+    popad                           // restore the registers saved by pushad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: no compiler prologue/epilogue; the asm returns itself
+void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int ystep,
+                                int uvstep) {
+  __asm {
+    pushad                          // saves 8 registers, hence the +32 below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    jmp       wend                  // check the count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // eax = U sample
+    mov       ebx, [esp + 32 + 28]  // uvstep
+    add       edi, ebx              // U pointer += uvstep
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]   // eax = V sample
+    add       esi, ebx              // V pointer += uvstep
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
+    movzx     eax, byte ptr [edx]   // eax = first Y sample
+    mov       ebx, [esp + 32 + 24]  // ystep
+    add       edx, ebx              // Y pointer += ystep
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]   // eax = second Y sample
+    add       edx, ebx              // Y pointer += ystep
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels reuse the same U + V terms
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // RGBY() scales entries by 64, so >> 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: mm1 low, mm2 high
+    movntq    [ebp], mm1            // non-temporal store of 8 bytes (2 pixels)
+    add       ebp, 8
+ wend :
+    sub       ecx, 2                // two pixels per pass
+    jns       wloop                 // loop while the count stays >= 0
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // last pixel: same math on one Y sample
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // store 4 bytes (1 pixel)
+ wdone :
+
+    popad                           // restore the registers saved by pushad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: no compiler prologue/epilogue; the asm returns itself
+void DoubleYUVToRGB32Row(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) {
+  __asm {
+    pushad                          // saves 8 registers, hence the +32 below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    jmp       wend                  // check the count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // eax = U sample
+    add       edi, 1
+    movzx     ebx, byte ptr [esi]   // ebx = V sample
+    add       esi, 1
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [edx]   // eax = first Y sample
+    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V terms
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6                // RGBY() scales entries by 64, so >> 6
+    packuswb  mm1, mm1              // saturate words to bytes
+    punpckldq mm1, mm1              // copy the low dword into the high dword
+    movntq    [ebp], mm1            // first pixel written twice (8 bytes)
+
+    movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample
+    add       edx, 2                // two Y samples consumed per pass
+    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // reuse the U + V terms
+    psraw     mm0, 6
+    packuswb  mm0, mm0
+    punpckldq mm0, mm0              // copy the low dword into the high dword
+    movntq    [ebp+8], mm0          // second pixel written twice (8 bytes)
+    add       ebp, 16
+ wend :
+    sub       ecx, 4                // four output pixels per pass
+    jns       wloop                 // loop while the count stays >= 0
+
+    add       ecx, 4                // leftover outputs (0..3 when width >= 0)
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // convert one more pixel for the leftovers
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    jmp       wend1                 // check the leftover count before storing
+
+ wloop1 :
+    movd      [ebp], mm1            // store the same pixel (4 bytes) each pass
+    add       ebp, 4
+ wend1 :
+    sub       ecx, 1
+    jns       wloop1                // one store per leftover output pixel
+ wdone :
+    popad                           // restore the registers saved by pushad
+    ret
+  }
+}
+
+// This version does general purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270 degrees.
+// For performance the chroma is undersampled, reducing the cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+__declspec(naked)  // naked: no compiler prologue/epilogue; the asm returns itself
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int dx) {
+  __asm {
+    pushad                          // saves 8 registers, hence the +32 below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    xor       ebx, ebx              // x
+    jmp       scaleend              // check the count before the first pass
+
+ scaleloop :
+    mov       eax, ebx
+    sar       eax, 5                // chroma index = x >> 5
+    movzx     eax, byte ptr [edi + eax]  // U sample
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 5                // chroma index = x >> 5
+    movzx     eax, byte ptr [esi + eax]  // V sample
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
+    mov       eax, ebx              // keep x from before the increment
+    add       ebx, [esp + 32 + 24]  // x += dx
+    sar       eax, 4                // luma index = x >> 4
+    movzx     eax, byte ptr [edx + eax]  // first Y sample
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    mov       eax, ebx              // keep x from before the increment
+    add       ebx, [esp + 32 + 24]  // x += dx
+    sar       eax, 4                // luma index = x >> 4
+    movzx     eax, byte ptr [edx + eax]  // second Y sample
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels reuse the same U + V terms
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // RGBY() scales entries by 64, so >> 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: mm1 low, mm2 high
+    movntq    [ebp], mm1            // non-temporal store of 8 bytes (2 pixels)
+    add       ebp, 8
+ scaleend :
+    sub       ecx, 2                // two pixels per pass
+    jns       scaleloop             // loop while the count stays >= 0
+
+    and       ecx, 1  // odd number of pixels?
+    jz        scaledone
+
+    mov       eax, ebx              // last pixel: same lookups at the final x
+    sar       eax, 5
+    movzx     eax, byte ptr [edi + eax]
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 5
+    movzx     eax, byte ptr [esi + eax]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 4
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // store 4 bytes (1 pixel)
+
+ scaledone :
+    popad                           // restore the registers saved by pushad
+    ret
+  }
+}
+
 #endif // ARCH_CPU_64_BITS
 }  // extern "C"
 
--- a/layout/generic/nsVideoFrame.cpp
+++ b/layout/generic/nsVideoFrame.cpp
@@ -246,16 +246,20 @@ nsVideoFrame::BuildLayer(nsDisplayListBu
   // the largest rectangle that fills our content-box and has the
   // correct aspect ratio.
   nsPresContext* presContext = PresContext();
   gfxRect r = gfxRect(presContext->AppUnitsToGfxUnits(area.x),
                       presContext->AppUnitsToGfxUnits(area.y),
                       presContext->AppUnitsToGfxUnits(area.width),
                       presContext->AppUnitsToGfxUnits(area.height));
   r = CorrectForAspectRatio(r, videoSize);
+  r.Round();
+  gfxIntSize scaleHint(static_cast<PRInt32>(r.Width()),
+                       static_cast<PRInt32>(r.Height()));
+  container->SetScaleHint(scaleHint);
 
   nsRefPtr<ImageLayer> layer = static_cast<ImageLayer*>
     (aBuilder->LayerBuilder()->GetLeafLayerFor(aBuilder, aManager, aItem));
   if (!layer) {
     layer = aManager->CreateImageLayer();
     if (!layer)
       return nsnull;
   }