Bug 577843 - Scale videos at YCbCr conversion time - r=roc a=blocking2.0
author: Chris Double <chris.double@double.co.nz>
date: Tue, 26 Oct 2010 16:11:13 +1300
changeset: 57389 8ecd9dc6684e04dd6a3d37b0bc8f40c5847e4e7a
parent: 57388 59bbc730aee4b5bfb49fabac9633efadeb72c101
child: 57390 64901a1fcf9339c7497c0f203d9df1f95df3c738
push id: 16900
push user: cdouble@mozilla.com
push date: Fri, 12 Nov 2010 03:14:11 +0000
treeherder: mozilla-central@d2d645506534 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: roc, blocking2
bugs: 577843
milestone: 2.0b8pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 577843 - Scale videos at YCbCr conversion time - r=roc a=blocking2.0
content/media/nsMediaDecoder.cpp
gfx/layers/ImageLayers.h
gfx/layers/basic/BasicImages.cpp
gfx/ycbcr/README
gfx/ycbcr/add_scale.patch
gfx/ycbcr/update.sh
gfx/ycbcr/yuv_convert.cpp
gfx/ycbcr/yuv_convert.h
gfx/ycbcr/yuv_row.h
gfx/ycbcr/yuv_row_c.cpp
gfx/ycbcr/yuv_row_linux.cpp
gfx/ycbcr/yuv_row_mac.cpp
gfx/ycbcr/yuv_row_win.cpp
layout/generic/nsVideoFrame.cpp
--- a/content/media/nsMediaDecoder.cpp
+++ b/content/media/nsMediaDecoder.cpp
@@ -45,28 +45,23 @@
 #include "nsIDocument.h"
 #include "nsThreadUtils.h"
 #include "nsIDOMHTMLMediaElement.h"
 #include "nsNetUtil.h"
 #include "nsHTMLMediaElement.h"
 #include "nsAutoLock.h"
 #include "nsIRenderingContext.h"
 #include "gfxContext.h"
-#include "gfxImageSurface.h"
 #include "nsPresContext.h"
 #include "nsDOMError.h"
 #include "nsDisplayList.h"
 #ifdef MOZ_SVG
 #include "nsSVGEffects.h"
 #endif
 
-#if defined(XP_MACOSX)
-#include "gfxQuartzImageSurface.h"
-#endif
-
 // Number of milliseconds between progress events as defined by spec
 #define PROGRESS_MS 350
 
 // Number of milliseconds of no data before a stall event is fired as defined by spec
 #define STALL_MS 3000
 
 // Number of milliseconds between timeupdate events as defined by spec
 #define TIMEUPDATE_MS 250
--- a/gfx/layers/ImageLayers.h
+++ b/gfx/layers/ImageLayers.h
@@ -109,16 +109,17 @@ protected:
  * (because layers can only be used on the main thread) and we want to
  * be able to set the current Image from any thread, to facilitate
  * video playback without involving the main thread, for example.
  */
 class THEBES_API ImageContainer {
   THEBES_INLINE_DECL_THREADSAFE_REFCOUNTING(ImageContainer)
 
 public:
+  ImageContainer() {}
   virtual ~ImageContainer() {}
 
   /**
    * Create an Image in one of the given formats.
    * Picks the "best" format from the list and creates an Image of that
    * format.
    * Returns null if this backend does not support any of the formats.
    */
@@ -174,16 +175,23 @@ public:
 
   /**
    * Set a new layer manager for this image container.  It must be
    * either of the same type as the container's current layer manager,
    * or null.  TRUE is returned on success.
    */
   virtual PRBool SetLayerManager(LayerManager *aManager) = 0;
 
+  /**
+   * Sets a size that the image is expected to be rendered at.
+   * This is a hint for image backends to optimize scaling.
+   * Default implementation in this class is to ignore the hint.
+   */
+  virtual void SetScaleHint(const gfxIntSize& /* aScaleHint */) { }
+
 protected:
   LayerManager* mManager;
 
   ImageContainer(LayerManager* aManager) : mManager(aManager) {}
 };
 
 /**
  * A Layer which renders an Image.
--- a/gfx/layers/basic/BasicImages.cpp
+++ b/gfx/layers/basic/BasicImages.cpp
@@ -99,39 +99,50 @@ protected:
 
 /**
  * We handle YCbCr by converting to RGB when the image is initialized
  * (which should be done off the main thread). The RGB results are stored
  * in a memory buffer and converted to a cairo surface lazily.
  */
 class BasicPlanarYCbCrImage : public PlanarYCbCrImage, public BasicImageImplData {
 public:
-  BasicPlanarYCbCrImage() :
-    PlanarYCbCrImage(static_cast<BasicImageImplData*>(this))
+   /** 
+    * aScaleHint is a size that the image is expected to be rendered at.
+    * This is a hint for image backends to optimize scaling.
+    */
+  BasicPlanarYCbCrImage(const gfxIntSize& aScaleHint) :
+    PlanarYCbCrImage(static_cast<BasicImageImplData*>(this)),
+    mScaleHint(aScaleHint)
     {}
 
   virtual void SetData(const Data& aData);
 
   virtual already_AddRefed<gfxASurface> GetAsSurface();
 
 protected:
   nsAutoArrayPtr<PRUint8>              mBuffer;
   nsCountedRef<nsMainThreadSurfaceRef> mSurface;
+  gfxIntSize                           mScaleHint;
 };
 
 void
 BasicPlanarYCbCrImage::SetData(const Data& aData)
 {
   // Do some sanity checks to prevent integer overflow
   if (aData.mYSize.width > 16384 || aData.mYSize.height > 16384) {
     NS_ERROR("Illegal width or height");
     return;
   }
-  size_t size = aData.mPicSize.width*aData.mPicSize.height*4;
-  mBuffer = new PRUint8[size];
+  // 'prescale' is true if the scaling is to be done as part of the
+  // YCbCr to RGB conversion rather than on the RGB data when rendered.
+  PRBool prescale = mScaleHint.width > 0 && mScaleHint.height > 0;
+  gfxIntSize size(prescale ? mScaleHint.width : aData.mPicSize.width,
+                  prescale ? mScaleHint.height : aData.mPicSize.height);
+
+  mBuffer = new PRUint8[size.width * size.height * 4];
   if (!mBuffer) {
     // out of memory
     return;
   }
 
   gfx::YUVType type = gfx::YV12;
   if (aData.mYSize.width == aData.mCbCrSize.width &&
       aData.mYSize.height == aData.mCbCrSize.height) {
@@ -144,30 +155,47 @@ BasicPlanarYCbCrImage::SetData(const Dat
   else if (aData.mYSize.width / 2 == aData.mCbCrSize.width &&
            aData.mYSize.height / 2 == aData.mCbCrSize.height ) {
     type = gfx::YV12;
   }
   else {
     NS_ERROR("YCbCr format not supported");
   }
  
-  // Convert from YCbCr to RGB now
-  gfx::ConvertYCbCrToRGB32(aData.mYChannel,
+  // Convert from YCbCr to RGB now, scaling the image if needed.
+  if (size != aData.mPicSize) {
+    gfx::ScaleYCbCrToRGB32(aData.mYChannel,
                            aData.mCbChannel,
                            aData.mCrChannel,
                            mBuffer,
-                           aData.mPicX,
-                           aData.mPicY,
                            aData.mPicSize.width,
                            aData.mPicSize.height,
+                           size.width,
+                           size.height,
                            aData.mYStride,
                            aData.mCbCrStride,
-                           aData.mPicSize.width*4,
-                           type);                                                          
-  mSize = aData.mPicSize;
+                           size.width*4,
+                           type,
+                           gfx::ROTATE_0);
+  }
+  else {
+    gfx::ConvertYCbCrToRGB32(aData.mYChannel,
+                             aData.mCbChannel,
+                             aData.mCrChannel,
+                             mBuffer,
+                             aData.mPicX,
+                             aData.mPicY,
+                             aData.mPicSize.width,
+                             aData.mPicSize.height,
+                             aData.mYStride,
+                             aData.mCbCrStride,
+                             aData.mPicSize.width*4,
+                             type);                                                          
+  }
+  mSize = size;
 }
 
 static cairo_user_data_key_t imageSurfaceDataKey;
 
 static void
 DestroyBuffer(void* aBuffer)
 {
   delete[] static_cast<PRUint8*>(aBuffer);
@@ -213,29 +241,32 @@ BasicPlanarYCbCrImage::GetAsSurface()
 /**
  * Our image container is very simple. It's really just a factory
  * for the image objects. We use a Monitor to synchronize access to
  * mImage.
  */
 class BasicImageContainer : public ImageContainer {
 public:
   BasicImageContainer(BasicLayerManager* aManager) :
-    ImageContainer(aManager), mMonitor("BasicImageContainer")
+    ImageContainer(aManager), mMonitor("BasicImageContainer"),
+    mScaleHint(-1, -1)
   {}
   virtual already_AddRefed<Image> CreateImage(const Image::Format* aFormats,
                                               PRUint32 aNumFormats);
   virtual void SetCurrentImage(Image* aImage);
   virtual already_AddRefed<Image> GetCurrentImage();
   virtual already_AddRefed<gfxASurface> GetCurrentAsSurface(gfxIntSize* aSize);
   virtual gfxIntSize GetCurrentSize();
   virtual PRBool SetLayerManager(LayerManager *aManager);
+  virtual void SetScaleHint(const gfxIntSize& aScaleHint);
 
 protected:
   Monitor mMonitor;
   nsRefPtr<Image> mImage;
+  gfxIntSize mScaleHint;
 };
 
 /**
  * Returns true if aFormat is in the given format array.
  */
 static PRBool
 FormatInList(const Image::Format* aFormats, PRUint32 aNumFormats,
              Image::Format aFormat)
@@ -252,17 +283,18 @@ already_AddRefed<Image>
 BasicImageContainer::CreateImage(const Image::Format* aFormats,
                                  PRUint32 aNumFormats)
 {
   nsRefPtr<Image> image;
   // Prefer cairo surfaces because they're native for us
   if (FormatInList(aFormats, aNumFormats, Image::CAIRO_SURFACE)) {
     image = new BasicCairoImage();
   } else if (FormatInList(aFormats, aNumFormats, Image::PLANAR_YCBCR)) {
-    image = new BasicPlanarYCbCrImage();
+    MonitorAutoEnter mon(mMonitor);
+    image = new BasicPlanarYCbCrImage(mScaleHint);
   }
   return image.forget();
 }
 
 void
 BasicImageContainer::SetCurrentImage(Image* aImage)
 {
   MonitorAutoEnter mon(mMonitor);
@@ -298,16 +330,22 @@ BasicImageContainer::GetCurrentAsSurface
 
 gfxIntSize
 BasicImageContainer::GetCurrentSize()
 {
   MonitorAutoEnter mon(mMonitor);
   return !mImage ? gfxIntSize(0,0) : ToImageData(mImage)->GetSize();
 }
 
+void BasicImageContainer::SetScaleHint(const gfxIntSize& aScaleHint)
+{
+  MonitorAutoEnter mon(mMonitor);
+  mScaleHint = aScaleHint;
+}
+
 PRBool
 BasicImageContainer::SetLayerManager(LayerManager *aManager)
 {
   if (aManager &&
       aManager->GetBackendType() != LayerManager::LAYERS_BASIC)
   {
     return PR_FALSE;
   }
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -16,8 +16,9 @@ picture_region.patch: Change Chromium co
 
 remove_scale.patch: Removes Chromium scaling code.
 export.patch: Fix export for building on comm-central
 win64_mac64.patch: Fallback to C implementation on Windows and Mac OS X 64 bit
 yv24.patch: Adds YCbCr 4:4:4 support
 row_c_fix.patch: Fix broken C fallback code (See bug 561385).
 bug572034_mac_64bit.patch: Fix x86_64 linux code so it works on OS X.
 solaris.patch: Adds Solaris support, fallback to C implementation on SPARC
+add_scale.patch: Re-adds Chromium scaling code.
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/add_scale.patch
@@ -0,0 +1,953 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+index 40ce10f..7d46629 100644
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -82,10 +82,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
+ 
+ #ifdef ARCH_CPU_X86_FAMILY
+   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+   if (has_mmx)
+     EMMS();
+ #endif
+ }
+ 
++// Scale a frame of YUV to 32 bit ARGB.
++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int height,
++                                int scaled_width,
++                                int scaled_height,
++                                int y_pitch,
++                                int uv_pitch,
++                                int rgb_pitch,
++                                YUVType yuv_type,
++                                Rotate view_rotate) {
++  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
++  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
++  bool has_mmx = supports_mmx();
++  // Diagram showing origin and direction of source sampling.
++  // ->0   4<-
++  // 7       3
++  //
++  // 6       5
++  // ->1   2<-
++  // Rotations that start at right side of image.
++  if ((view_rotate == ROTATE_180) ||
++      (view_rotate == ROTATE_270) ||
++      (view_rotate == MIRROR_ROTATE_0) ||
++      (view_rotate == MIRROR_ROTATE_90)) {
++    y_buf += width - 1;
++    u_buf += width / 2 - 1;
++    v_buf += width / 2 - 1;
++    width = -width;
++  }
++  // Rotations that start at bottom of image.
++  if ((view_rotate == ROTATE_90) ||
++      (view_rotate == ROTATE_180) ||
++      (view_rotate == MIRROR_ROTATE_90) ||
++      (view_rotate == MIRROR_ROTATE_180)) {
++    y_buf += (height - 1) * y_pitch;
++    u_buf += ((height >> y_shift) - 1) * uv_pitch;
++    v_buf += ((height >> y_shift) - 1) * uv_pitch;
++    height = -height;
++  }
++
++  // Handle zero sized destination.
++  if (scaled_width == 0 || scaled_height == 0)
++    return;
++  int scaled_dx = width * 16 / scaled_width;
++  int scaled_dy = height * 16 / scaled_height;
++
++  int scaled_dx_uv = scaled_dx;
++
++  if ((view_rotate == ROTATE_90) ||
++      (view_rotate == ROTATE_270)) {
++    int tmp = scaled_height;
++    scaled_height = scaled_width;
++    scaled_width = tmp;
++    tmp = height;
++    height = width;
++    width = tmp;
++    int original_dx = scaled_dx;
++    int original_dy = scaled_dy;
++    scaled_dx = ((original_dy >> 4) * y_pitch) << 4;
++    scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4;
++    scaled_dy = original_dx;
++    if (view_rotate == ROTATE_90) {
++      y_pitch = -1;
++      uv_pitch = -1;
++      height = -height;
++    } else {
++      y_pitch = 1;
++      uv_pitch = 1;
++    }
++  }
++
++  for (int y = 0; y < scaled_height; ++y) {
++    uint8* dest_pixel = rgb_buf + y * rgb_pitch;
++    int scaled_y = (y * height / scaled_height);
++    const uint8* y_ptr = y_buf + scaled_y * y_pitch;
++    const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch;
++    const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch;
++
++#if defined(_MSC_VER) && defined(_M_IX86)
++    if (scaled_width == (width * 2)) {
++      DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                          dest_pixel, scaled_width);
++    } else if ((scaled_dx & 15) == 0) {  // Scaling by integer scale factor.
++      if (scaled_dx_uv == scaled_dx) {   // Not rotated.
++        if (scaled_dx == 16) {           // Not scaled
++          if (has_mmx)
++            FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                     dest_pixel, scaled_width);
++          else
++            FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                                      dest_pixel, scaled_width, x_shift);
++        } else {  // Simple scale down. ie half
++          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                               dest_pixel, scaled_width, scaled_dx >> 4);
++        }
++      } else {
++        RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                   dest_pixel, scaled_width,
++                                   scaled_dx >> 4, scaled_dx_uv >> 4);
++      }
++#else
++    if (scaled_dx == 16) {           // Not scaled
++      if (has_mmx)
++        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                                 dest_pixel, scaled_width);
++      else
++        FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                                   dest_pixel, scaled_width, x_shift);
++#endif
++    } else {
++      if (has_mmx) 
++        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                           dest_pixel, scaled_width, scaled_dx);
++      else
++        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                             dest_pixel, scaled_width, scaled_dx, x_shift);
++
++    }  
++  }
++
++  // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
++  if (has_mmx)
++    EMMS();
++}
++
+ }  // namespace gfx
+ }  // namespace mozilla
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+index c0b678d..a7e5b68 100644
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -15,27 +15,56 @@ namespace gfx {
+ // Type of YUV surface.
+ // The value of these enums matter as they are used to shift vertical indices.
+ enum YUVType {
+   YV12 = 0,           // YV12 is half width and half height chroma channels.
+   YV16 = 1,           // YV16 is half width and full height chroma channels.
+   YV24 = 2            // YV24 is full width and full height chroma channels.
+ };
+ 
++// Mirror means flip the image horizontally, as in looking in a mirror.
++// Rotate happens after mirroring.
++enum Rotate {
++  ROTATE_0,           // Rotation off.
++  ROTATE_90,          // Rotate clockwise.
++  ROTATE_180,         // Rotate upside down.
++  ROTATE_270,         // Rotate counter clockwise.
++  MIRROR_ROTATE_0,    // Mirror horizontally.
++  MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
++  MIRROR_ROTATE_180,  // Mirror vertically.
++  MIRROR_ROTATE_270   // Transpose.
++};
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
+                                   const uint8* uplane,
+                                   const uint8* vplane,
+                                   uint8* rgbframe,
+                                   int pic_x,
+                                   int pic_y,
+                                   int pic_width,
+                                   int pic_height,
+                                   int ystride,
+                                   int uvstride,
+                                   int rgbstride,
+                                   YUVType yuv_type);
+ 
++// Scale a frame of YUV to 32 bit ARGB.
++// Supports rotation and mirroring.
++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
++                                const uint8* uplane,
++                                const uint8* vplane,
++                                uint8* rgbframe,
++                                int frame_width,
++                                int frame_height,
++                                int scaled_width,
++                                int scaled_height,
++                                int ystride,
++                                int uvstride,
++                                int rgbstride,
++                                YUVType yuv_type,
++                                Rotate view_rotate);
++
+ }  // namespace gfx
+ }  // namespace mozilla
+ 
+ #endif  // MEDIA_BASE_YUV_CONVERT_H_
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+index 8519008..96969ec 100644
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 unsigned int x_shift);
+ 
+ 
++// Can do 1x, half size or any scale down by an integer amount.
++// Step can be negative (mirroring, rotate 180).
++// This is the third fastest of the scalers.
++void ConvertYUVToRGB32Row(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int step);
++
++// Rotate is like Convert, but applies different step to Y versus U and V.
++// This allows rotation by 90 or 270, by stepping by stride.
++// This is the fourth fastest of the scalers.
++void RotateConvertYUVToRGB32Row(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int ystep,
++                                int uvstep);
++
++// Doubler does 4 pixels at a time.  Each pixel is replicated.
++// This is the fastest of the scalers.
++void DoubleYUVToRGB32Row(const uint8* y_buf,
++                         const uint8* u_buf,
++                         const uint8* v_buf,
++                         uint8* rgb_buf,
++                         int width);
++
++// Handles arbitrary scaling up or down.
++// Mirroring is supported, but not 90 or 270 degree rotation.
++// Chroma is under sampled every 2 pixels for performance.
++// This is the slowest of the scalers.
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx);
++
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int scaled_dx,
++                          unsigned int x_shift);
++
+ }  // extern "C"
+ 
+ // x64 uses MMX2 (SSE) so emms is not required.
+ #if defined(ARCH_CPU_X86)
+ #if defined(_MSC_VER)
+ #define EMMS() __asm emms
+ #else
+ #define EMMS() asm("emms")
+diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
+index b5c0018..49eced2 100644
+--- a/gfx/ycbcr/yuv_row_c.cpp
++++ b/gfx/ycbcr/yuv_row_c.cpp
+@@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+         v = v_buf[x + 1];
+       }
+       YuvPixel(y1, u, v, rgb_buf + 4);
+     }
+     rgb_buf += 8;  // Advance 2 pixels.
+   }
+ }
+ 
++// 28.4 fixed point is used.  A shift by 4 isolates the integer.
++// A shift by 5 is used to further subsample the chrominance channels.
++// & 15 isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
++// for 1/4 pixel accurate interpolation.
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx,
++                        unsigned int x_shift) {
++  int scaled_x = 0;
++  for (int x = 0; x < width; ++x) {
++    uint8 u = u_buf[scaled_x >> (4 + x_shift)];
++    uint8 v = v_buf[scaled_x >> (4 + x_shift)];
++    uint8 y0 = y_buf[scaled_x >> 4];
++    YuvPixel(y0, u, v, rgb_buf);
++    rgb_buf += 4;
++    scaled_x += scaled_dx;
++  }
++}
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_linux.cpp b/gfx/ycbcr/yuv_row_linux.cpp
+index 9f7625c..bff02b3 100644
+--- a/gfx/ycbcr/yuv_row_linux.cpp
++++ b/gfx/ycbcr/yuv_row_linux.cpp
+@@ -16,16 +16,24 @@ extern "C" {
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+  
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
++}
+ #else
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+ }
+@@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
+     "r"(u_buf),  // %1
+     "r"(v_buf),  // %2
+     "r"(rgb_buf),  // %3
+     "r"(width),  // %4
+     "r" (kCoefficientsRgbY)  // %5
+   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+ }
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
++                        const uint8* u_buf,  // rsi
++                        const uint8* v_buf,  // rdx
++                        uint8* rgb_buf,      // rcx
++                        int width,           // r8
++                        int scaled_dx) {     // r9
++  asm(
++  "xor    %%r11,%%r11\n"
++  "sub    $0x2,%4\n"
++  "js     scalenext\n"
++
++"scaleloop:"
++  "mov    %%r11,%%r10\n"
++  "sar    $0x5,%%r10\n"
++  "movzb  (%1,%%r10,1),%%rax\n"
++  "movq   2048(%5,%%rax,8),%%xmm0\n"
++  "movzb  (%2,%%r10,1),%%rax\n"
++  "movq   4096(%5,%%rax,8),%%xmm1\n"
++  "lea    (%%r11,%6),%%r10\n"
++  "sar    $0x4,%%r11\n"
++  "movzb  (%0,%%r11,1),%%rax\n"
++  "paddsw %%xmm1,%%xmm0\n"
++  "movq   (%5,%%rax,8),%%xmm1\n"
++  "lea    (%%r10,%6),%%r11\n"
++  "sar    $0x4,%%r10\n"
++  "movzb  (%0,%%r10,1),%%rax\n"
++  "movq   (%5,%%rax,8),%%xmm2\n"
++  "paddsw %%xmm0,%%xmm1\n"
++  "paddsw %%xmm0,%%xmm2\n"
++  "shufps $0x44,%%xmm2,%%xmm1\n"
++  "psraw  $0x6,%%xmm1\n"
++  "packuswb %%xmm1,%%xmm1\n"
++  "movq   %%xmm1,0x0(%3)\n"
++  "add    $0x8,%3\n"
++  "sub    $0x2,%4\n"
++  "jns    scaleloop\n"
++
++"scalenext:"
++  "add    $0x1,%4\n"
++  "js     scaledone\n"
++
++  "mov    %%r11,%%r10\n"
++  "sar    $0x5,%%r10\n"
++  "movzb  (%1,%%r10,1),%%rax\n"
++  "movq   2048(%5,%%rax,8),%%xmm0\n"
++  "movzb  (%2,%%r10,1),%%rax\n"
++  "movq   4096(%5,%%rax,8),%%xmm1\n"
++  "paddsw %%xmm1,%%xmm0\n"
++  "sar    $0x4,%%r11\n"
++  "movzb  (%0,%%r11,1),%%rax\n"
++  "movq   (%5,%%rax,8),%%xmm1\n"
++  "paddsw %%xmm0,%%xmm1\n"
++  "psraw  $0x6,%%xmm1\n"
++  "packuswb %%xmm1,%%xmm1\n"
++  "movd   %%xmm1,0x0(%3)\n"
++
++"scaledone:"
++  :
++  : "r"(y_buf),  // %0
++    "r"(u_buf),  // %1
++    "r"(v_buf),  // %2
++    "r"(rgb_buf),  // %3
++    "r"(width),  // %4
++    "r" (kCoefficientsRgbY),  // %5
++    "r"(static_cast<long>(scaled_dx))  // %6
++  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
++);
++}
++
+ #endif // __SUNPRO_CC
+ 
+ #else // ARCH_CPU_X86_64
+ 
+ #ifdef __SUNPRO_CC
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+@@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+   "packuswb %mm1,%mm1\n"
+   "movd   %mm1,0x0(%ebp)\n"
+ "2:"
+   "popa\n"
+   "ret\n"
+   ".previous\n"
+ );
+ 
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx);
++
++  asm(
++  ".global ScaleYUVToRGB32Row\n"
++"ScaleYUVToRGB32Row:\n"
++  "pusha\n"
++  "mov    0x24(%esp),%edx\n"
++  "mov    0x28(%esp),%edi\n"
++  "mov    0x2c(%esp),%esi\n"
++  "mov    0x30(%esp),%ebp\n"
++  "mov    0x34(%esp),%ecx\n"
++  "xor    %ebx,%ebx\n"
++  "jmp    scaleend\n"
++
++"scaleloop:"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
++  "paddsw %mm0,%mm1\n"
++  "paddsw %mm0,%mm2\n"
++  "psraw  $0x6,%mm1\n"
++  "psraw  $0x6,%mm2\n"
++  "packuswb %mm2,%mm1\n"
++  "movntq %mm1,0x0(%ebp)\n"
++  "add    $0x8,%ebp\n"
++"scaleend:"
++  "sub    $0x2,%ecx\n"
++  "jns    scaleloop\n"
++
++  "and    $0x1,%ecx\n"
++  "je     scaledone\n"
++
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
++  "paddsw %mm0,%mm1\n"
++  "psraw  $0x6,%mm1\n"
++  "packuswb %mm1,%mm1\n"
++  "movd   %mm1,0x0(%ebp)\n"
++
++"scaledone:"
++  "popa\n"
++  "ret\n"
++);
++
+ #endif // __SUNPRO_CC
+ #endif // ARCH_CPU_X86_64
+ #endif // !ARCH_CPU_X86_FAMILY
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_mac.cpp b/gfx/ycbcr/yuv_row_mac.cpp
+index a1d0058..5acf825 100644
+--- a/gfx/ycbcr/yuv_row_mac.cpp
++++ b/gfx/ycbcr/yuv_row_mac.cpp
+@@ -16,16 +16,24 @@ extern "C" {
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+  
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
++}
+ #else
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+ }
+@@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+                           &kCoefficientsRgbY[0][0]);
+ }
+ 
++extern void MacScaleYUVToRGB32Row(const uint8* y_buf,
++                               const uint8* u_buf,
++                               const uint8* v_buf,
++                               uint8* rgb_buf,
++                               int width,
++                               int scaled_dx,
++                               int16 *kCoefficientsRgbY);
++
++  __asm__(
++"_MacScaleYUVToRGB32Row:\n"
++  "pusha\n"
++  "mov    0x24(%esp),%edx\n"
++  "mov    0x28(%esp),%edi\n"
++  "mov    0x2c(%esp),%esi\n"
++  "mov    0x30(%esp),%ebp\n"
++  "mov    0x3c(%esp),%ecx\n"
++  "xor    %ebx,%ebx\n"
++  "jmp    Lscaleend\n"
++
++"Lscaleloop:"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   2048(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw 4096(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm1\n"
++  "mov    %ebx,%eax\n"
++  "add    0x38(%esp),%ebx\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm2\n"
++  "paddsw %mm0,%mm1\n"
++  "paddsw %mm0,%mm2\n"
++  "psraw  $0x6,%mm1\n"
++  "psraw  $0x6,%mm2\n"
++  "packuswb %mm2,%mm1\n"
++  "movntq %mm1,0x0(%ebp)\n"
++  "add    $0x8,%ebp\n"
++"Lscaleend:"
++  "sub    $0x2,0x34(%esp)\n"
++  "jns    Lscaleloop\n"
++
++  "and    $0x1,0x34(%esp)\n"
++  "je     Lscaledone\n"
++
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%edi,%eax,1),%eax\n"
++  "movq   2048(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x5,%eax\n"
++  "movzbl (%esi,%eax,1),%eax\n"
++  "paddsw 4096(%ecx,%eax,8),%mm0\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x4,%eax\n"
++  "movzbl (%edx,%eax,1),%eax\n"
++  "movq   0(%ecx,%eax,8),%mm1\n"
++  "paddsw %mm0,%mm1\n"
++  "psraw  $0x6,%mm1\n"
++  "packuswb %mm1,%mm1\n"
++  "movd   %mm1,0x0(%ebp)\n"
++
++"Lscaledone:"
++  "popa\n"
++  "ret\n"
++);
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++
++  MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx,
++                        &kCoefficientsRgbY[0][0]);
++}
++
+ #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS
+ }  // extern "C"
+ 
+diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
+index 699ac77..a1700fc 100644
+--- a/gfx/ycbcr/yuv_row_win.cpp
++++ b/gfx/ycbcr/yuv_row_win.cpp
+@@ -11,17 +11,26 @@ extern "C" {
+ // PPC implementation uses C fallback
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+ }
+- 
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int scaled_dx) {
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
++}
++
+ #else
+ 
+ 
+ #define RGBY(i) { \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+   0 \
+@@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+     movd      [ebp], mm1
+  convertdone :
+ 
+     popad
+     ret
+   }
+ }
+ 
++__declspec(naked)
++void ConvertYUVToRGB32Row(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int step) {
++  __asm {
++    pushad
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    mov       ebx, [esp + 32 + 24]  // step
++    jmp       wend
++
++ wloop :
++    movzx     eax, byte ptr [edi]
++    add       edi, ebx
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    add       esi, ebx
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    add       edx, ebx
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    add       edx, ebx
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    paddsw    mm2, mm0
++    psraw     mm1, 6
++    psraw     mm2, 6
++    packuswb  mm1, mm2
++    movntq    [ebp], mm1
++    add       ebp, 8
++ wend :
++    sub       ecx, 2
++    jns       wloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1
++ wdone :
++
++    popad
++    ret
++  }
++}
++
++__declspec(naked)
++void RotateConvertYUVToRGB32Row(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int ystep,
++                                int uvstep) {
++  __asm {
++    pushad
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    jmp       wend
++
++ wloop :
++    movzx     eax, byte ptr [edi]
++    mov       ebx, [esp + 32 + 28]  // uvstep
++    add       edi, ebx
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    add       esi, ebx
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    mov       ebx, [esp + 32 + 24]  // ystep
++    add       edx, ebx
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    add       edx, ebx
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    paddsw    mm2, mm0
++    psraw     mm1, 6
++    psraw     mm2, 6
++    packuswb  mm1, mm2
++    movntq    [ebp], mm1
++    add       ebp, 8
++ wend :
++    sub       ecx, 2
++    jns       wloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1
++ wdone :
++
++    popad
++    ret
++  }
++}
++
++__declspec(naked)
++void DoubleYUVToRGB32Row(const uint8* y_buf,
++                         const uint8* u_buf,
++                         const uint8* v_buf,
++                         uint8* rgb_buf,
++                         int width) {
++  __asm {
++    pushad
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    jmp       wend
++
++ wloop :
++    movzx     eax, byte ptr [edi]
++    add       edi, 1
++    movzx     ebx, byte ptr [esi]
++    add       esi, 1
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    punpckldq mm1, mm1
++    movntq    [ebp], mm1
++
++    movzx     ebx, byte ptr [edx + 1]
++    add       edx, 2
++    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]
++    psraw     mm0, 6
++    packuswb  mm0, mm0
++    punpckldq mm0, mm0
++    movntq    [ebp+8], mm0
++    add       ebp, 16
++ wend :
++    sub       ecx, 4
++    jns       wloop
++
++    add       ecx, 4
++    jz        wdone
++
++    movzx     eax, byte ptr [edi]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    movzx     eax, byte ptr [esi]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    movzx     eax, byte ptr [edx]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    jmp       wend1
++
++ wloop1 :
++    movd      [ebp], mm1
++    add       ebp, 4
++ wend1 :
++    sub       ecx, 1
++    jns       wloop1
++ wdone :
++    popad
++    ret
++  }
++}
++
++// This version does general purpose scaling by any amount, up or down.
++// The only thing it cannot do is rotation by 90 or 270.
++// For performance the chroma is under sampled, reducing cost of a 3x
++// 1080p scale from 8.4 ms to 5.4 ms.
++__declspec(naked)
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int dx) {
++  __asm {
++    pushad
++    mov       edx, [esp + 32 + 4]   // Y
++    mov       edi, [esp + 32 + 8]   // U
++    mov       esi, [esp + 32 + 12]  // V
++    mov       ebp, [esp + 32 + 16]  // rgb
++    mov       ecx, [esp + 32 + 20]  // width
++    xor       ebx, ebx              // x
++    jmp       scaleend
++
++ scaleloop :
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [edi + eax]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [esi + eax]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    mov       eax, ebx
++    add       ebx, [esp + 32 + 24]  // x += dx
++    sar       eax, 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    mov       eax, ebx
++    add       ebx, [esp + 32 + 24]  // x += dx
++    sar       eax, 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm2, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    paddsw    mm2, mm0
++    psraw     mm1, 6
++    psraw     mm2, 6
++    packuswb  mm1, mm2
++    movntq    [ebp], mm1
++    add       ebp, 8
++ scaleend :
++    sub       ecx, 2
++    jns       scaleloop
++
++    and       ecx, 1  // odd number of pixels?
++    jz        scaledone
++
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [edi + eax]
++    movq      mm0, [kCoefficientsRgbU + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 5
++    movzx     eax, byte ptr [esi + eax]
++    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
++    mov       eax, ebx
++    sar       eax, 4
++    movzx     eax, byte ptr [edx + eax]
++    movq      mm1, [kCoefficientsRgbY + 8 * eax]
++    paddsw    mm1, mm0
++    psraw     mm1, 6
++    packuswb  mm1, mm1
++    movd      [ebp], mm1
++
++ scaledone :
++    popad
++    ret
++  }
++}
++
+ #endif // ARCH_CPU_64_BITS
+ }  // extern "C"
+ 
--- a/gfx/ycbcr/update.sh
+++ b/gfx/ycbcr/update.sh
@@ -10,8 +10,9 @@ patch -p3 <convert.patch
 patch -p3 <picture_region.patch
 patch -p3 <remove_scale.patch
 patch -p3 <export.patch
 patch -p3 <win64_mac64.patch
 patch -p3 <yv24.patch
 patch -p3 <row_c_fix.patch
 patch -p3 <bug572034_mac_64bit.patch
 patch -p3 <bug577645_movntq.patch
+patch -p3 <add_scale.patch
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -84,10 +84,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const 
 
 #ifdef ARCH_CPU_X86_FAMILY
   // SSE used for FastConvertYUVToRGB32Row requires emms instruction.
   if (has_sse)
     EMMS();
 #endif
 }
 
+// Scale a frame of YUV to 32 bit ARGB.
+NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int height,
+                                int scaled_width,
+                                int scaled_height,
+                                int y_pitch,
+                                int uv_pitch,
+                                int rgb_pitch,
+                                YUVType yuv_type,
+                                Rotate view_rotate) {
+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
+  bool has_mmx = supports_mmx();
+  // Diagram showing origin and direction of source sampling.
+  // ->0   4<-
+  // 7       3
+  //
+  // 6       5
+  // ->1   2<-
+  // Rotations that start at right side of image.
+  if ((view_rotate == ROTATE_180) ||
+      (view_rotate == ROTATE_270) ||
+      (view_rotate == MIRROR_ROTATE_0) ||
+      (view_rotate == MIRROR_ROTATE_90)) {
+    y_buf += width - 1;
+    u_buf += width / 2 - 1;
+    v_buf += width / 2 - 1;
+    width = -width;
+  }
+  // Rotations that start at bottom of image.
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_180) ||
+      (view_rotate == MIRROR_ROTATE_90) ||
+      (view_rotate == MIRROR_ROTATE_180)) {
+    y_buf += (height - 1) * y_pitch;
+    u_buf += ((height >> y_shift) - 1) * uv_pitch;
+    v_buf += ((height >> y_shift) - 1) * uv_pitch;
+    height = -height;
+  }
+
+  // Handle zero sized destination.
+  if (scaled_width == 0 || scaled_height == 0)
+    return;
+  int scaled_dx = width * 16 / scaled_width;
+  int scaled_dy = height * 16 / scaled_height;
+
+  int scaled_dx_uv = scaled_dx;
+
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_270)) {
+    int tmp = scaled_height;
+    scaled_height = scaled_width;
+    scaled_width = tmp;
+    tmp = height;
+    height = width;
+    width = tmp;
+    int original_dx = scaled_dx;
+    int original_dy = scaled_dy;
+    scaled_dx = ((original_dy >> 4) * y_pitch) << 4;
+    scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4;
+    scaled_dy = original_dx;
+    if (view_rotate == ROTATE_90) {
+      y_pitch = -1;
+      uv_pitch = -1;
+      height = -height;
+    } else {
+      y_pitch = 1;
+      uv_pitch = 1;
+    }
+  }
+
+  for (int y = 0; y < scaled_height; ++y) {
+    uint8* dest_pixel = rgb_buf + y * rgb_pitch;
+    int scaled_y = (y * height / scaled_height);
+    const uint8* y_ptr = y_buf + scaled_y * y_pitch;
+    const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch;
+    const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch;
+
+#if defined(_MSC_VER) && defined(_M_IX86)
+    if (scaled_width == (width * 2)) {
+      DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                          dest_pixel, scaled_width);
+    } else if ((scaled_dx & 15) == 0) {  // Scaling by integer scale factor.
+      if (scaled_dx_uv == scaled_dx) {   // Not rotated.
+        if (scaled_dx == 16) {           // Not scaled
+          if (has_mmx)
+            FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                     dest_pixel, scaled_width);
+          else
+            FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                                      dest_pixel, scaled_width, x_shift);
+        } else {  // Simple scale down. ie half
+          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                               dest_pixel, scaled_width, scaled_dx >> 4);
+        }
+      } else {
+        RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, scaled_width,
+                                   scaled_dx >> 4, scaled_dx_uv >> 4);
+      }
+#else
+    if (scaled_dx == 16) {           // Not scaled
+      if (has_mmx)
+        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                 dest_pixel, scaled_width);
+      else
+        FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, scaled_width, x_shift);
+#endif
+    } else {
+      if (has_mmx) 
+        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                           dest_pixel, scaled_width, scaled_dx);
+      else
+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, scaled_width, scaled_dx, x_shift);
+
+    }  
+  }
+
+  // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+  if (has_mmx)
+    EMMS();
+}
+
 }  // namespace gfx
 }  // namespace mozilla
--- a/gfx/ycbcr/yuv_convert.h
+++ b/gfx/ycbcr/yuv_convert.h
@@ -15,27 +15,56 @@ namespace gfx {
 // Type of YUV surface.
 // The value of these enums matter as they are used to shift vertical indices.
 enum YUVType {
   YV12 = 0,           // YV12 is half width and half height chroma channels.
   YV16 = 1,           // YV16 is half width and full height chroma channels.
   YV24 = 2            // YV24 is full width and full height chroma channels.
 };
 
+// Mirror means flip the image horizontally, as in looking in a mirror.
+// Rotate happens after mirroring.
+enum Rotate {
+  ROTATE_0,           // Rotation off.
+  ROTATE_90,          // Rotate clockwise.
+  ROTATE_180,         // Rotate upside down.
+  ROTATE_270,         // Rotate counter clockwise.
+  MIRROR_ROTATE_0,    // Mirror horizontally.
+  MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
+  MIRROR_ROTATE_180,  // Mirror vertically.
+  MIRROR_ROTATE_270   // Transpose.
+};
+
 // Convert a frame of YUV to 32 bit ARGB.
 // Pass in YV16/YV12 depending on source format
 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
                                   const uint8* uplane,
                                   const uint8* vplane,
                                   uint8* rgbframe,
                                   int pic_x,
                                   int pic_y,
                                   int pic_width,
                                   int pic_height,
                                   int ystride,
                                   int uvstride,
                                   int rgbstride,
                                   YUVType yuv_type);
 
+// Scale a frame of YUV to 32 bit ARGB.
+// Supports rotation and mirroring.
+NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
+                                const uint8* uplane,
+                                const uint8* vplane,
+                                uint8* rgbframe,
+                                int frame_width,
+                                int frame_height,
+                                int scaled_width,
+                                int scaled_height,
+                                int ystride,
+                                int uvstride,
+                                int rgbstride,
+                                YUVType yuv_type,
+                                Rotate view_rotate);
+
 }  // namespace gfx
 }  // namespace mozilla
 
 #endif  // MEDIA_BASE_YUV_CONVERT_H_
--- a/gfx/ycbcr/yuv_row.h
+++ b/gfx/ycbcr/yuv_row.h
@@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint
 void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width,
                                 unsigned int x_shift);
 
 
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+void ConvertYUVToRGB32Row(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int step);
+
+// Rotate is like Convert, but applies different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int ystep,
+                                int uvstep);
+
+// Doubler does 4 pixels at a time.  Each pixel is replicated.
+// This is the fastest of the scalers.
+void DoubleYUVToRGB32Row(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width);
+
+// Handles arbitrary scaling up or down.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+// This is the slowest of the scalers.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int scaled_dx,
+                          unsigned int x_shift);
+
 }  // extern "C"
 
 // x64 uses MMX2 (SSE) so emms is not required.
 #if defined(ARCH_CPU_X86)
 #if defined(_MSC_VER)
 #define EMMS() __asm emms
 #else
 #define EMMS() asm("emms")
--- a/gfx/ycbcr/yuv_row_c.cpp
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const ui
         v = v_buf[x + 1];
       }
       YuvPixel(y1, u, v, rgb_buf + 4);
     }
     rgb_buf += 8;  // Advance 2 pixels.
   }
 }
 
+// 28.4 fixed point is used.  A shift by 4 isolates the integer.
+// A shift by (4 + x_shift) indexes the chrominance channels, which are
+// horizontally subsampled when x_shift is 1.  The fractional bits (& 15)
+// are ignored: pixels are point sampled, with no interpolation.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx,
+                        unsigned int x_shift) {
+  int scaled_x = 0;
+  for (int x = 0; x < width; ++x) {
+    uint8 u = u_buf[scaled_x >> (4 + x_shift)];
+    uint8 v = v_buf[scaled_x >> (4 + x_shift)];
+    uint8 y0 = y_buf[scaled_x >> 4];
+    YuvPixel(y0, u, v, rgb_buf);
+    rgb_buf += 4;
+    scaled_x += scaled_dx;
+  }
+}
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_linux.cpp
+++ b/gfx/ycbcr/yuv_row_linux.cpp
@@ -16,16 +16,24 @@ extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
  
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
+}
 #else
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
 }
@@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint
     "r"(u_buf),  // %1
     "r"(v_buf),  // %2
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY)  // %5
   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
 );
 }
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
+                        const uint8* u_buf,  // rsi
+                        const uint8* v_buf,  // rdx
+                        uint8* rgb_buf,      // rcx
+                        int width,           // r8
+                        int scaled_dx) {     // r9
+  asm(
+  "xor    %%r11,%%r11\n"
+  "sub    $0x2,%4\n"
+  "js     scalenext\n"
+
+"scaleloop:"
+  "mov    %%r11,%%r10\n"
+  "sar    $0x5,%%r10\n"
+  "movzb  (%1,%%r10,1),%%rax\n"
+  "movq   2048(%5,%%rax,8),%%xmm0\n"
+  "movzb  (%2,%%r10,1),%%rax\n"
+  "movq   4096(%5,%%rax,8),%%xmm1\n"
+  "lea    (%%r11,%6),%%r10\n"
+  "sar    $0x4,%%r11\n"
+  "movzb  (%0,%%r11,1),%%rax\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movq   (%5,%%rax,8),%%xmm1\n"
+  "lea    (%%r10,%6),%%r11\n"
+  "sar    $0x4,%%r10\n"
+  "movzb  (%0,%%r10,1),%%rax\n"
+  "movq   (%5,%%rax,8),%%xmm2\n"
+  "paddsw %%xmm0,%%xmm1\n"
+  "paddsw %%xmm0,%%xmm2\n"
+  "shufps $0x44,%%xmm2,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movq   %%xmm1,0x0(%3)\n"
+  "add    $0x8,%3\n"
+  "sub    $0x2,%4\n"
+  "jns    scaleloop\n"
+
+"scalenext:"
+  "add    $0x1,%4\n"
+  "js     scaledone\n"
+
+  "mov    %%r11,%%r10\n"
+  "sar    $0x5,%%r10\n"
+  "movzb  (%1,%%r10,1),%%rax\n"
+  "movq   2048(%5,%%rax,8),%%xmm0\n"
+  "movzb  (%2,%%r10,1),%%rax\n"
+  "movq   4096(%5,%%rax,8),%%xmm1\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "sar    $0x4,%%r11\n"
+  "movzb  (%0,%%r11,1),%%rax\n"
+  "movq   (%5,%%rax,8),%%xmm1\n"
+  "paddsw %%xmm0,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movd   %%xmm1,0x0(%3)\n"
+
+"scaledone:"
+  :
+  : "r"(y_buf),  // %0
+    "r"(u_buf),  // %1
+    "r"(v_buf),  // %2
+    "r"(rgb_buf),  // %3
+    "r"(width),  // %4
+    "r" (kCoefficientsRgbY),  // %5
+    "r"(static_cast<long>(scaled_dx))  // %6
+  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
 #endif // __SUNPRO_CC
 
 #else // ARCH_CPU_X86_64
 
 #ifdef __SUNPRO_CC
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
@@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
 "2:"
   "popa\n"
   "ret\n"
   ".previous\n"
 );
 
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx);
+
+  asm(
+  ".global ScaleYUVToRGB32Row\n"
+"ScaleYUVToRGB32Row:\n"
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+  "xor    %ebx,%ebx\n"
+  "jmp    scaleend\n"
+
+"scaleloop:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movntq %mm1,0x0(%ebp)\n"
+  "add    $0x8,%ebp\n"
+"scaleend:"
+  "sub    $0x2,%ecx\n"
+  "jns    scaleloop\n"
+
+  "and    $0x1,%ecx\n"
+  "je     scaledone\n"
+
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"
+
+"scaledone:"
+  "popa\n"
+  "ret\n"
+);
+
 #endif // __SUNPRO_CC
 #endif // ARCH_CPU_X86_64
 #endif // !ARCH_CPU_X86_FAMILY
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_mac.cpp
+++ b/gfx/ycbcr/yuv_row_mac.cpp
@@ -16,16 +16,24 @@ extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
  
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
+}
 #else
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
 }
@@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
                           &kCoefficientsRgbY[0][0]);
 }
 
+extern void MacScaleYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width,
+                               int scaled_dx,
+                               int16 *kCoefficientsRgbY);
+
+  __asm__(
+"_MacScaleYUVToRGB32Row:\n"
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x3c(%esp),%ecx\n"
+  "xor    %ebx,%ebx\n"
+  "jmp    Lscaleend\n"
+
+"Lscaleloop:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movntq %mm1,0x0(%ebp)\n"
+  "add    $0x8,%ebp\n"
+"Lscaleend:"
+  "sub    $0x2,0x34(%esp)\n"
+  "jns    Lscaleloop\n"
+
+  "and    $0x1,0x34(%esp)\n"
+  "je     Lscaledone\n"
+
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x5,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x4,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"
+
+"Lscaledone:"
+  "popa\n"
+  "ret\n"
+);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {
+
+  MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx,
+                        &kCoefficientsRgbY[0][0]);
+}
+
 #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS
 }  // extern "C"
 
--- a/gfx/ycbcr/yuv_row_win.cpp
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -11,17 +11,26 @@ extern "C" {
 // PPC implementation uses C fallback
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
- 
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int scaled_dx) {
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1);
+}
+
 #else
 
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
@@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint
     movd      [ebp], mm1
  convertdone :
 
     popad
     ret
   }
 }
 
+__declspec(naked)
+void ConvertYUVToRGB32Row(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int step) {  // Two pixels per loop pass; the Y, U and V pointers are all advanced by `step`.
+  __asm {
+    pushad                          // pushes 8 registers (32 bytes), hence the "+ 32" in the offsets below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    mov       ebx, [esp + 32 + 24]  // step
+    jmp       wend                  // test the pixel count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // U sample
+    add       edi, ebx
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U term (8-byte table entry)
+    movzx     eax, byte ptr [esi]   // V sample
+    add       esi, ebx
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V term, saturating
+    movzx     eax, byte ptr [edx]   // first Y sample
+    add       edx, ebx
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]   // second Y sample
+    add       edx, ebx
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels share the same chroma term
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // undo the 64x fixed-point scale (see RGBY)
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to unsigned bytes: two 4-byte pixels in mm1
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ wend :
+    sub       ecx, 2                // two pixels consumed per pass
+    jns       wloop
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // final single pixel: same math, pointers not advanced
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the one remaining pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int ystep,
+                                int uvstep) {  // Two pixels per loop pass; Y advances by ystep, U and V by uvstep.
+  __asm {
+    pushad                          // pushes 8 registers (32 bytes), hence the "+ 32" in the offsets below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    jmp       wend                  // test the pixel count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // U sample
+    mov       ebx, [esp + 32 + 28]  // uvstep
+    add       edi, ebx
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U term (8-byte table entry)
+    movzx     eax, byte ptr [esi]   // V sample
+    add       esi, ebx
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V term, saturating
+    movzx     eax, byte ptr [edx]   // first Y sample
+    mov       ebx, [esp + 32 + 24]  // ystep
+    add       edx, ebx
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]   // second Y sample
+    add       edx, ebx
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels share the same chroma term
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // undo the 64x fixed-point scale (see RGBY)
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to unsigned bytes: two 4-byte pixels in mm1
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ wend :
+    sub       ecx, 2                // two pixels consumed per pass
+    jns       wloop
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // final single pixel: same math, pointers not advanced
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the one remaining pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void DoubleYUVToRGB32Row(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) {  // Writes every converted source pixel twice; width is counted in output pixels.
+  __asm {
+    pushad                          // pushes 8 registers (32 bytes), hence the "+ 32" in the offsets below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    jmp       wend                  // test the pixel count before the first pass
+
+ wloop :
+    movzx     eax, byte ptr [edi]   // U sample
+    add       edi, 1
+    movzx     ebx, byte ptr [esi]   // V sample
+    add       esi, 1
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [edx]   // first Y sample
+    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V chroma term, saturating
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6                // undo the 64x fixed-point scale (see RGBY)
+    packuswb  mm1, mm1
+    punpckldq mm1, mm1              // duplicate the low dword: the same pixel twice
+    movntq    [ebp], mm1            // output pixels 0 and 1
+
+    movzx     ebx, byte ptr [edx + 1]  // second Y sample
+    add       edx, 2
+    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // add luma into the chroma register in place
+    psraw     mm0, 6
+    packuswb  mm0, mm0
+    punpckldq mm0, mm0              // duplicate the low dword: the same pixel twice
+    movntq    [ebp+8], mm0          // output pixels 2 and 3
+    add       ebp, 16
+ wend :
+    sub       ecx, 4                // four output pixels per pass
+    jns       wloop
+
+    add       ecx, 4                // restore the leftover count (0..3)
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // convert one more source pixel for the leftover outputs
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    jmp       wend1
+
+ wloop1 :
+    movd      [ebp], mm1            // every leftover output pixel gets this same value
+    add       ebp, 4
+ wend1 :
+    sub       ecx, 1
+    jns       wloop1
+ wdone :
+    popad
+    ret
+  }
+}
+
+// This version does general-purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270 degrees.
+// For performance the chroma is undersampled, reducing the cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+__declspec(naked)
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int dx) {  // Steps a source position x by dx per output pixel; Y is read at x >> 4, U and V at x >> 5.
+  __asm {
+    pushad                          // pushes 8 registers (32 bytes), hence the "+ 32" in the offsets below
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+    xor       ebx, ebx              // x
+    jmp       scaleend              // test the pixel count before the first pass
+
+ scaleloop :
+    mov       eax, ebx
+    sar       eax, 5                // chroma index = x >> 5 (half the luma index)
+    movzx     eax, byte ptr [edi + eax]
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 5
+    movzx     eax, byte ptr [esi + eax]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V chroma term, saturating
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24]  // x += dx
+    sar       eax, 4                // luma index = x >> 4
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24]  // x += dx
+    sar       eax, 4
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels use the chroma sampled at the first pixel's x
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // undo the 64x fixed-point scale (see RGBY)
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to unsigned bytes: two 4-byte pixels in mm1
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ scaleend :
+    sub       ecx, 2                // two output pixels per pass
+    jns       scaleloop
+
+    and       ecx, 1  // odd number of pixels?
+    jz        scaledone
+
+    mov       eax, ebx              // final single pixel, sampled at the current x
+    sar       eax, 5
+    movzx     eax, byte ptr [edi + eax]
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 5
+    movzx     eax, byte ptr [esi + eax]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 4
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the one remaining pixel
+
+ scaledone :
+    popad
+    ret
+  }
+}
+
 #endif // ARCH_CPU_64_BITS
 }  // extern "C"
 
--- a/layout/generic/nsVideoFrame.cpp
+++ b/layout/generic/nsVideoFrame.cpp
@@ -246,16 +246,20 @@ nsVideoFrame::BuildLayer(nsDisplayListBu
   // the largest rectangle that fills our content-box and has the
   // correct aspect ratio.
   nsPresContext* presContext = PresContext();
   gfxRect r = gfxRect(presContext->AppUnitsToGfxUnits(area.x),
                       presContext->AppUnitsToGfxUnits(area.y),
                       presContext->AppUnitsToGfxUnits(area.width),
                       presContext->AppUnitsToGfxUnits(area.height));
   r = CorrectForAspectRatio(r, videoSize);
+  r.Round();
+  gfxIntSize scaleHint(static_cast<PRInt32>(r.Width()),
+                       static_cast<PRInt32>(r.Height()));
+  container->SetScaleHint(scaleHint);
 
   nsRefPtr<ImageLayer> layer = static_cast<ImageLayer*>
     (aBuilder->LayerBuilder()->GetLeafLayerFor(aBuilder, aManager, aItem));
   if (!layer) {
     layer = aManager->CreateImageLayer();
     if (!layer)
       return nsnull;
   }