Bug 1478269 - Remove ycbcr update script and patches. r=jrmuizel
author Mike Hommey <mh+mozilla@glandium.org>
Fri, 24 Aug 2018 10:46:14 +0900
changeset 488863 998cf7d22736ed2197fd018e1ee28a8257e3de96
parent 488862 7acaf7396b0dbe05b241eb41973f23d29e152462
child 488864 a69ed9d1f49c097a67d6f67bdc0e91d7f26e32b4
push id 9734
push user shindli@mozilla.com
push date Thu, 30 Aug 2018 12:18:07 +0000
treeherder mozilla-beta@71c71ab3afae [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 1478269
milestone 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1478269 - Remove ycbcr update script and patches. r=jrmuizel

ycbcr is dead upstream, and has been for almost as long as the code in the gecko tree hasn't been updated. Let's not pretend that we can actually run the update script and that having the patches separated matters, because there's no upstream to apply those patches to anymore.

Update README accordingly.

Differential Revision: https://phabricator.services.mozilla.com/D4198
gfx/ycbcr/QuellGccWarnings.patch
gfx/ycbcr/README
gfx/ycbcr/TypeFromSize.patch
gfx/ycbcr/clang-cl-workaround.patch
gfx/ycbcr/convert.patch
gfx/ycbcr/update.sh
gfx/ycbcr/win64.patch
deleted file mode 100644
--- a/gfx/ycbcr/QuellGccWarnings.patch
+++ /dev/null
@@ -1,40 +0,0 @@
-diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
---- a/gfx/ycbcr/yuv_convert.cpp
-+++ b/gfx/ycbcr/yuv_convert.cpp
-@@ -337,16 +337,17 @@ void ScaleYCbCrToRGB32(const uint* yplan
-                                          source_dx_uv >> kFractionBits);
-         }
-       }
-       else {
-         ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
-                              dest_pixel, width, source_dx);
-       }
- #else
-+      (void)source_dx_uv;
-       ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                          dest_pixel, width, source_dx);
- #endif
-     }
-   }
-   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
-   if (has_mmx)
-     EMMS();
-diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
---- a/gfx/ycbcr/yuv_row.h
-+++ b/gfx/ycbcr/yuv_row.h
-@@ -129,14 +129,14 @@ extern SIMD_ALIGNED(int16 kCoefficientsR
- #if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
- #if defined(_MSC_VER)
- #define EMMS() __asm emms
- #pragma warning(disable: 4799)
- #else
- #define EMMS() asm("emms")
- #endif
- #else
--#define EMMS()
-+#define EMMS() ((void)0)
- #endif
- 
- }  // extern "C"
- 
- #endif  // MEDIA_BASE_YUV_ROW_H_
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -1,31 +1,8 @@
 This color conversion code is from the Chromium open source project available here:
 
 http://code.google.com/chromium/
 
 The code comes from svn revision 63840 on 2010-10-26.
 
-If you just want to check out this individual directory, use:
-
-svn co -r 63840 http://src.chromium.org/svn/trunk/src/media/base
-
-The code was copied from a Chromium svn checkout using the 'update.sh' script which then applies patches for our build and to add dynamic CPU detection.
-
-convert.patch contains the following changes:
-
-  * Change Chromium code to build using Mozilla build system.
-  * Add runtime CPU detection for MMX
-  * Move default C implementation to work on all platforms.
-  * Change Chromium code to allow a picture region.
-  * The YUV conversion will convert within this picture region only.
-  * Add YCbCr 4:4:4 support
-  * Bug 619178 - Update CPU detection in yuv_convert to new SSE.h interface.
-  * Bug 616778 - Split yuv_convert FilterRows vectorized code into separate files so it can
-    be properly guarded with cpuid() calls.
-
-win64.patch: SSE2 optimization for Microsoft Visual C++ x64 version
-
-TypeFromSize.patch: Bug 656185 - Add a method to detect YUVType from plane sizes.
-
-QuellGccWarnings.patch: Bug 711895 - Avoid some GCC compilation warnings.
-
-clang-cl-workaround.patch: Bug 1422368 - Work around a clang-cl unresolved symbol bug.
+It has been superseded upstream by libyuv (which is spawned off it). Bug 791941 is about
+trying to replace this code with libyuv.
deleted file mode 100644
--- a/gfx/ycbcr/TypeFromSize.patch
+++ /dev/null
@@ -1,58 +0,0 @@
-diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
---- a/gfx/ycbcr/yuv_convert.cpp
-+++ b/gfx/ycbcr/yuv_convert.cpp
-@@ -26,16 +26,32 @@ namespace mozilla {
- 
- namespace gfx {
-  
- // 16.16 fixed point arithmetic
- const int kFractionBits = 16;
- const int kFractionMax = 1 << kFractionBits;
- const int kFractionMask = ((1 << kFractionBits) - 1);
- 
-+YUVType TypeFromSize(int ywidth, 
-+                     int yheight, 
-+                     int cbcrwidth, 
-+                     int cbcrheight)
-+{
-+  if (ywidth == cbcrwidth && yheight == cbcrheight) {
-+    return YV24;
-+  }
-+  else if (ywidth / 2 == cbcrwidth && yheight == cbcrheight) {
-+    return YV16;
-+  }
-+  else {
-+    return YV12;
-+  }
-+}
-+
- // Convert a frame of YUV to 32 bit ARGB.
- void ConvertYCbCrToRGB32(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int pic_x,
-                          int pic_y,
-                          int pic_width,
-diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
---- a/gfx/ycbcr/yuv_convert.h
-+++ b/gfx/ycbcr/yuv_convert.h
-@@ -36,16 +36,18 @@ enum Rotate {
- // Filter affects how scaling looks.
- enum ScaleFilter {
-   FILTER_NONE = 0,        // No filter (point sampled).
-   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
-   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
-   FILTER_BILINEAR = 3     // Bilinear filter.
- };
- 
-+YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight);
-+
- // Convert a frame of YUV to 32 bit ARGB.
- // Pass in YV16/YV12 depending on source format
- void ConvertYCbCrToRGB32(const uint8* yplane,
-                          const uint8* uplane,
-                          const uint8* vplane,
-                          uint8* rgbframe,
-                          int pic_x,
-                          int pic_y,
deleted file mode 100644
--- a/gfx/ycbcr/clang-cl-workaround.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
---- a/gfx/ycbcr/yuv_row_win.cpp
-+++ b/gfx/ycbcr/yuv_row_win.cpp
-@@ -7,19 +7,21 @@
- 
- #define kCoefficientsRgbU kCoefficientsRgbY + 2048
- #define kCoefficientsRgbV kCoefficientsRgbY + 4096
- 
- extern "C" {
- 
- #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
- #if defined(__clang__)
--// clang-cl may erroneously discard the symbol `kCoefficientsRgbY`
--// https://bugs.llvm.org/show_bug.cgi?id=35290
--volatile auto keep_kCoefficientsRgbY_alive = &kCoefficientsRgbY;
-+// clang-cl has a bug where it doesn't mangle names in inline asm
-+// so let's do the mangling in the preprocessor (ugh)
-+// (but we still need to declare a dummy extern for the parser)
-+extern void* _kCoefficientsRgbY;
-+#define kCoefficientsRgbY _kCoefficientsRgbY
- #endif
- 
- __declspec(naked)
- void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width) {
deleted file mode 100644
--- a/gfx/ycbcr/convert.patch
+++ /dev/null
@@ -1,3143 +0,0 @@
-diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
---- a/gfx/ycbcr/yuv_convert.cpp
-+++ b/gfx/ycbcr/yuv_convert.cpp
-@@ -6,145 +6,102 @@
- // http://www.fourcc.org/yuv.php
- // The actual conversion is best described here
- // http://en.wikipedia.org/wiki/YUV
- // An article on optimizing YUV conversion using tables instead of multiplies
- // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
- //
- // YV12 is a full plane of Y and a half height, half width chroma planes
- // YV16 is a full plane of Y and a full height, half width chroma planes
-+// YV24 is a full plane of Y and a full height, full width chroma planes
- //
- // ARGB pixel format is output, which on little endian is stored as BGRA.
- // The alpha is set to 255, allowing the application to use RGBA or RGB32.
- 
--#include "media/base/yuv_convert.h"
-+#include "yuv_convert.h"
- 
- // Header for low level row functions.
--#include "media/base/yuv_row.h"
--
--#if USE_MMX
--#if defined(_MSC_VER)
--#include <intrin.h>
--#else
--#include <mmintrin.h>
--#endif
--#endif
--
--#if USE_SSE2
--#include <emmintrin.h>
--#endif
--
--namespace media {
--
-+#include "yuv_row.h"
-+#include "mozilla/SSE.h"
-+
-+namespace mozilla {
-+
-+namespace gfx {
-+ 
- // 16.16 fixed point arithmetic
- const int kFractionBits = 16;
- const int kFractionMax = 1 << kFractionBits;
- const int kFractionMask = ((1 << kFractionBits) - 1);
- 
- // Convert a frame of YUV to 32 bit ARGB.
--void ConvertYUVToRGB32(const uint8* y_buf,
--                       const uint8* u_buf,
--                       const uint8* v_buf,
--                       uint8* rgb_buf,
--                       int width,
--                       int height,
--                       int y_pitch,
--                       int uv_pitch,
--                       int rgb_pitch,
--                       YUVType yuv_type) {
--  unsigned int y_shift = yuv_type;
--  for (int y = 0; y < height; ++y) {
--    uint8* rgb_row = rgb_buf + y * rgb_pitch;
--    const uint8* y_ptr = y_buf + y * y_pitch;
--    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
--    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
--
--    FastConvertYUVToRGB32Row(y_ptr,
--                             u_ptr,
--                             v_ptr,
--                             rgb_row,
--                             width);
--  }
-+void ConvertYCbCrToRGB32(const uint8* y_buf,
-+                         const uint8* u_buf,
-+                         const uint8* v_buf,
-+                         uint8* rgb_buf,
-+                         int pic_x,
-+                         int pic_y,
-+                         int pic_width,
-+                         int pic_height,
-+                         int y_pitch,
-+                         int uv_pitch,
-+                         int rgb_pitch,
-+                         YUVType yuv_type) {
-+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
-+  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
-+  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
-+  bool has_sse = supports_mmx() && supports_sse();
-+  // There is no optimized YV24 SSE routine so we check for this and
-+  // fall back to the C code.
-+  has_sse &= yuv_type != YV24;
-+  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
-+  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
-+
-+  for (int y = pic_y; y < pic_height + pic_y; ++y) {
-+    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
-+    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
-+    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
-+    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
-+
-+    if (odd_pic_x) {
-+      // Handle the single odd pixel manually and use the
-+      // fast routines for the remaining.
-+      FastConvertYUVToRGB32Row_C(y_ptr++,
-+                                 u_ptr++,
-+                                 v_ptr++,
-+                                 rgb_row,
-+                                 1,
-+                                 x_shift);
-+      rgb_row += 4;
-+    }
-+
-+    if (has_sse) {
-+      FastConvertYUVToRGB32Row(y_ptr,
-+                               u_ptr,
-+                               v_ptr,
-+                               rgb_row,
-+                               x_width);
-+    }
-+    else {
-+      FastConvertYUVToRGB32Row_C(y_ptr,
-+                                 u_ptr,
-+                                 v_ptr,
-+                                 rgb_row,
-+                                 x_width,
-+                                 x_shift);
-+    }
-+  }
- 
-   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
--  EMMS();
--}
--
--#if USE_SSE2
--// FilterRows combines two rows of the image using linear interpolation.
--// SSE2 version does 16 pixels at a time
--
--static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
--                       int source_width, int source_y_fraction) {
--  __m128i zero = _mm_setzero_si128();
--  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
--  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
--
--  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
--  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
--  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
--  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
--
--  do {
--    __m128i y0 = _mm_loadu_si128(y0_ptr128);
--    __m128i y1 = _mm_loadu_si128(y1_ptr128);
--    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
--    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
--    y0 = _mm_unpacklo_epi8(y0, zero);
--    y1 = _mm_unpacklo_epi8(y1, zero);
--    y0 = _mm_mullo_epi16(y0, y0_fraction);
--    y1 = _mm_mullo_epi16(y1, y1_fraction);
--    y2 = _mm_mullo_epi16(y2, y0_fraction);
--    y3 = _mm_mullo_epi16(y3, y1_fraction);
--    y0 = _mm_add_epi16(y0, y1);
--    y2 = _mm_add_epi16(y2, y3);
--    y0 = _mm_srli_epi16(y0, 8);
--    y2 = _mm_srli_epi16(y2, 8);
--    y0 = _mm_packus_epi16(y0, y2);
--    *dest128++ = y0;
--    ++y0_ptr128;
--    ++y1_ptr128;
--  } while (dest128 < end128);
--}
--#elif USE_MMX
--// MMX version does 8 pixels at a time
--static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
--                       int source_width, int source_y_fraction) {
--  __m64 zero = _mm_setzero_si64();
--  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
--  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
--
--  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
--  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
--  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
--  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
--
--  do {
--    __m64 y0 = *y0_ptr64++;
--    __m64 y1 = *y1_ptr64++;
--    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
--    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
--    y0 = _mm_unpacklo_pi8(y0, zero);
--    y1 = _mm_unpacklo_pi8(y1, zero);
--    y0 = _mm_mullo_pi16(y0, y0_fraction);
--    y1 = _mm_mullo_pi16(y1, y1_fraction);
--    y2 = _mm_mullo_pi16(y2, y0_fraction);
--    y3 = _mm_mullo_pi16(y3, y1_fraction);
--    y0 = _mm_add_pi16(y0, y1);
--    y2 = _mm_add_pi16(y2, y3);
--    y0 = _mm_srli_pi16(y0, 8);
--    y2 = _mm_srli_pi16(y2, 8);
--    y0 = _mm_packs_pu16(y0, y2);
--    *dest64++ = y0;
--  } while (dest64 < end64);
--}
--#else  // no MMX or SSE2
-+  if (has_sse)
-+    EMMS();
-+}
-+
- // C version does 8 at a time to mimic MMX code
--static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
--                       int source_width, int source_y_fraction) {
-+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-+                         int source_width, int source_y_fraction) {
-   int y1_fraction = source_y_fraction;
-   int y0_fraction = 256 - y1_fraction;
-   uint8* end = ybuf + source_width;
-   do {
-     ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
-     ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
-     ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
-     ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
-@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
-     ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
-     ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
-     ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
-     y0_ptr += 8;
-     y1_ptr += 8;
-     ybuf += 8;
-   } while (ybuf < end);
- }
--#endif
-+
-+#ifdef MOZILLA_MAY_SUPPORT_MMX
-+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-+                    int source_width, int source_y_fraction);
-+#endif
-+
-+#ifdef MOZILLA_MAY_SUPPORT_SSE2
-+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-+                     int source_width, int source_y_fraction);
-+#endif
-+
-+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
-+                              const uint8* y1_ptr, int source_width,
-+                              int source_y_fraction) {
-+#ifdef MOZILLA_MAY_SUPPORT_SSE2
-+  if (mozilla::supports_sse2()) {
-+    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
-+    return;
-+  }
-+#endif
-+
-+#ifdef MOZILLA_MAY_SUPPORT_MMX
-+  if (mozilla::supports_mmx()) {
-+    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
-+    return;
-+  }
-+#endif
-+
-+  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
-+}
- 
- 
- // Scale a frame of YUV to 32 bit ARGB.
--void ScaleYUVToRGB32(const uint8* y_buf,
--                     const uint8* u_buf,
--                     const uint8* v_buf,
--                     uint8* rgb_buf,
--                     int source_width,
--                     int source_height,
--                     int width,
--                     int height,
--                     int y_pitch,
--                     int uv_pitch,
--                     int rgb_pitch,
--                     YUVType yuv_type,
--                     Rotate view_rotate,
--                     ScaleFilter filter) {
-+void ScaleYCbCrToRGB32(const uint8* y_buf,
-+                       const uint8* u_buf,
-+                       const uint8* v_buf,
-+                       uint8* rgb_buf,
-+                       int source_width,
-+                       int source_height,
-+                       int width,
-+                       int height,
-+                       int y_pitch,
-+                       int uv_pitch,
-+                       int rgb_pitch,
-+                       YUVType yuv_type,
-+                       Rotate view_rotate,
-+                       ScaleFilter filter) {
-+  bool has_mmx = supports_mmx();
-+
-   // 4096 allows 3 buffers to fit in 12k.
-   // Helps performance on CPU with 16K L1 cache.
-   // Large enough for 3830x2160 and 30" displays which are 2560x1600.
-   const int kFilterBufferSize = 4096;
-   // Disable filtering if the screen is too big (to avoid buffer overflows).
-   // This should never happen to regular users: they don't have monitors
-   // wider than 4096 pixels.
-   // TODO(fbarchard): Allow rotated videos to filter.
-   if (source_width > kFilterBufferSize || view_rotate)
-     filter = FILTER_NONE;
- 
--  unsigned int y_shift = yuv_type;
-+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
-   // Diagram showing origin and direction of source sampling.
-   // ->0   4<-
-   // 7       3
-   //
-   // 6       5
-   // ->1   2<-
-   // Rotations that start at right side of image.
-   if ((view_rotate == ROTATE_180) ||
-@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
-     int source_uv_fraction =
-         ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
- 
-     const uint8* y_ptr = y0_ptr;
-     const uint8* u_ptr = u0_ptr;
-     const uint8* v_ptr = v0_ptr;
-     // Apply vertical filtering if necessary.
-     // TODO(fbarchard): Remove memcpy when not necessary.
--    if (filter & media::FILTER_BILINEAR_V) {
-+    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
-       if (yscale_fixed != kFractionMax &&
-           source_y_fraction && ((source_y + 1) < source_height)) {
-         FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
-       } else {
-         memcpy(ybuf, y0_ptr, source_width);
-       }
-       y_ptr = ybuf;
-       ybuf[source_width] = ybuf[source_width-1];
-@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
-       u_ptr = ubuf;
-       v_ptr = vbuf;
-       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
-       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
-     }
-     if (source_dx == kFractionMax) {  // Not scaled
-       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                                dest_pixel, width);
--    } else {
--      if (filter & FILTER_BILINEAR_H) {
-+    } else if (filter & FILTER_BILINEAR_H) {
-         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                                  dest_pixel, width, source_dx);
-     } else {
- // Specialized scalers and rotation.
--#if USE_MMX && defined(_MSC_VER)
-+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) && !defined(__clang__)
-+      if(mozilla::supports_sse()) {
-         if (width == (source_width * 2)) {
--          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
--                              dest_pixel, width);
-+          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
-+                                  dest_pixel, width);
-         } else if ((source_dx & kFractionMask) == 0) {
-           // Scaling by integer scale factor. ie half.
--          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
--                               dest_pixel, width,
--                               source_dx >> kFractionBits);
-+          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
-+                                   dest_pixel, width,
-+                                   source_dx >> kFractionBits);
-         } else if (source_dx_uv == source_dx) {  // Not rotated.
-           ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                              dest_pixel, width, source_dx);
-         } else {
--          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
--                                     dest_pixel, width,
--                                     source_dx >> kFractionBits,
--                                     source_dx_uv >> kFractionBits);
-+          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
-+                                         dest_pixel, width,
-+                                         source_dx >> kFractionBits,
-+                                         source_dx_uv >> kFractionBits);
-         }
-+      }
-+      else {
-+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
-+                             dest_pixel, width, source_dx);
-+      }
- #else
--        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
--                           dest_pixel, width, source_dx);
--#endif
--      }
-+      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-+                         dest_pixel, width, source_dx);
-+#endif
-     }
-   }
-   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
--  EMMS();
--}
--
--}  // namespace media
-+  if (has_mmx)
-+    EMMS();
-+}
-+
-+}  // namespace gfx
-+}  // namespace mozilla
-diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
---- a/gfx/ycbcr/yuv_convert.h
-+++ b/gfx/ycbcr/yuv_convert.h
-@@ -1,72 +1,79 @@
- // Copyright (c) 2010 The Chromium Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
- #ifndef MEDIA_BASE_YUV_CONVERT_H_
- #define MEDIA_BASE_YUV_CONVERT_H_
- 
--#include "base/basictypes.h"
--
--namespace media {
--
-+#include "chromium_types.h"
-+#include "gfxCore.h"
-+
-+namespace mozilla {
-+
-+namespace gfx {
-+ 
- // Type of YUV surface.
- // The value of these enums matter as they are used to shift vertical indices.
- enum YUVType {
--  YV16 = 0,           // YV16 is half width and full height chroma channels.
--  YV12 = 1,           // YV12 is half width and half height chroma channels.
-+  YV12 = 0,           // YV12 is half width and half height chroma channels.
-+  YV16 = 1,           // YV16 is half width and full height chroma channels.
-+  YV24 = 2            // YV24 is full width and full height chroma channels.
- };
- 
- // Mirror means flip the image horizontally, as in looking in a mirror.
- // Rotate happens after mirroring.
- enum Rotate {
-   ROTATE_0,           // Rotation off.
-   ROTATE_90,          // Rotate clockwise.
-   ROTATE_180,         // Rotate upside down.
-   ROTATE_270,         // Rotate counter clockwise.
-   MIRROR_ROTATE_0,    // Mirror horizontally.
-   MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
-   MIRROR_ROTATE_180,  // Mirror vertically.
--  MIRROR_ROTATE_270,  // Transpose.
-+  MIRROR_ROTATE_270   // Transpose.
- };
- 
- // Filter affects how scaling looks.
- enum ScaleFilter {
-   FILTER_NONE = 0,        // No filter (point sampled).
-   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
-   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
--  FILTER_BILINEAR = 3,    // Bilinear filter.
-+  FILTER_BILINEAR = 3     // Bilinear filter.
- };
- 
- // Convert a frame of YUV to 32 bit ARGB.
- // Pass in YV16/YV12 depending on source format
--void ConvertYUVToRGB32(const uint8* yplane,
--                       const uint8* uplane,
--                       const uint8* vplane,
--                       uint8* rgbframe,
--                       int width,
--                       int height,
--                       int ystride,
--                       int uvstride,
--                       int rgbstride,
--                       YUVType yuv_type);
-+void ConvertYCbCrToRGB32(const uint8* yplane,
-+                         const uint8* uplane,
-+                         const uint8* vplane,
-+                         uint8* rgbframe,
-+                         int pic_x,
-+                         int pic_y,
-+                         int pic_width,
-+                         int pic_height,
-+                         int ystride,
-+                         int uvstride,
-+                         int rgbstride,
-+                         YUVType yuv_type);
- 
- // Scale a frame of YUV to 32 bit ARGB.
- // Supports rotation and mirroring.
--void ScaleYUVToRGB32(const uint8* yplane,
--                     const uint8* uplane,
--                     const uint8* vplane,
--                     uint8* rgbframe,
--                     int source_width,
--                     int source_height,
--                     int width,
--                     int height,
--                     int ystride,
--                     int uvstride,
--                     int rgbstride,
--                     YUVType yuv_type,
--                     Rotate view_rotate,
--                     ScaleFilter filter);
--
--}  // namespace media
--
-+void ScaleYCbCrToRGB32(const uint8* yplane,
-+                       const uint8* uplane,
-+                       const uint8* vplane,
-+                       uint8* rgbframe,
-+                       int source_width,
-+                       int source_height,
-+                       int width,
-+                       int height,
-+                       int ystride,
-+                       int uvstride,
-+                       int rgbstride,
-+                       YUVType yuv_type,
-+                       Rotate view_rotate,
-+                       ScaleFilter filter);
-+
-+}  // namespace gfx
-+}  // namespace mozilla
-+ 
- #endif  // MEDIA_BASE_YUV_CONVERT_H_
-diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
-new file mode 100644
---- /dev/null
-+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
-@@ -0,0 +1,45 @@
-+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-+// Use of this source code is governed by a BSD-style license that can be
-+// found in the LICENSE file.
-+
-+#include <mmintrin.h>
-+#include "yuv_row.h"
-+
-+namespace mozilla {
-+namespace gfx {
-+
-+// FilterRows combines two rows of the image using linear interpolation.
-+// MMX version does 8 pixels at a time.
-+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-+                    int source_width, int source_y_fraction) {
-+  __m64 zero = _mm_setzero_si64();
-+  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
-+  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
-+
-+  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
-+  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
-+  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
-+  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
-+
-+  do {
-+    __m64 y0 = *y0_ptr64++;
-+    __m64 y1 = *y1_ptr64++;
-+    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
-+    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
-+    y0 = _mm_unpacklo_pi8(y0, zero);
-+    y1 = _mm_unpacklo_pi8(y1, zero);
-+    y0 = _mm_mullo_pi16(y0, y0_fraction);
-+    y1 = _mm_mullo_pi16(y1, y1_fraction);
-+    y2 = _mm_mullo_pi16(y2, y0_fraction);
-+    y3 = _mm_mullo_pi16(y3, y1_fraction);
-+    y0 = _mm_add_pi16(y0, y1);
-+    y2 = _mm_add_pi16(y2, y3);
-+    y0 = _mm_srli_pi16(y0, 8);
-+    y2 = _mm_srli_pi16(y2, 8);
-+    y0 = _mm_packs_pu16(y0, y2);
-+    *dest64++ = y0;
-+  } while (dest64 < end64);
-+}
-+
-+}
-+}
-diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
-new file mode 100644
---- /dev/null
-+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
-@@ -0,0 +1,47 @@
-+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-+// Use of this source code is governed by a BSD-style license that can be
-+// found in the LICENSE file.
-+
-+#include <emmintrin.h>
-+#include "yuv_row.h"
-+
-+namespace mozilla {
-+namespace gfx {
-+
-+// FilterRows combines two rows of the image using linear interpolation.
-+// SSE2 version does 16 pixels at a time.
-+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-+                     int source_width, int source_y_fraction) {
-+  __m128i zero = _mm_setzero_si128();
-+  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
-+  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
-+
-+  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
-+  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
-+  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
-+  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
-+
-+  do {
-+    __m128i y0 = _mm_loadu_si128(y0_ptr128);
-+    __m128i y1 = _mm_loadu_si128(y1_ptr128);
-+    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
-+    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
-+    y0 = _mm_unpacklo_epi8(y0, zero);
-+    y1 = _mm_unpacklo_epi8(y1, zero);
-+    y0 = _mm_mullo_epi16(y0, y0_fraction);
-+    y1 = _mm_mullo_epi16(y1, y1_fraction);
-+    y2 = _mm_mullo_epi16(y2, y0_fraction);
-+    y3 = _mm_mullo_epi16(y3, y1_fraction);
-+    y0 = _mm_add_epi16(y0, y1);
-+    y2 = _mm_add_epi16(y2, y3);
-+    y0 = _mm_srli_epi16(y0, 8);
-+    y2 = _mm_srli_epi16(y2, 8);
-+    y0 = _mm_packus_epi16(y0, y2);
-+    *dest128++ = y0;
-+    ++y0_ptr128;
-+    ++y1_ptr128;
-+  } while (dest128 < end128);
-+}
-+
-+}
-+}
-diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
---- a/gfx/ycbcr/yuv_row.h
-+++ b/gfx/ycbcr/yuv_row.h
-@@ -5,109 +5,133 @@
- // yuv_row internal functions to handle YUV conversion and scaling to RGB.
- // These functions are used from both yuv_convert.cc and yuv_scale.cc.
- 
- // TODO(fbarchard): Write function that can handle rotation and scaling.
- 
- #ifndef MEDIA_BASE_YUV_ROW_H_
- #define MEDIA_BASE_YUV_ROW_H_
- 
--#include "base/basictypes.h"
-+#include "chromium_types.h"
- 
- extern "C" {
- // Can only do 1x.
- // This is the second fastest of the scalers.
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width);
- 
--// Can do 1x, half size or any scale down by an integer amount.
--// Step can be negative (mirroring, rotate 180).
--// This is the third fastest of the scalers.
--void ConvertYUVToRGB32Row(const uint8* y_buf,
--                          const uint8* u_buf,
--                          const uint8* v_buf,
--                          uint8* rgb_buf,
--                          int width,
--                          int step);
--
--// Rotate is like Convert, but applies different step to Y versus U and V.
--// This allows rotation by 90 or 270, by stepping by stride.
--// This is the forth fastest of the scalers.
--void RotateConvertYUVToRGB32Row(const uint8* y_buf,
-+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width,
--                                int ystep,
--                                int uvstep);
-+                                unsigned int x_shift);
-+
-+void FastConvertYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width);
-+
-+// Can do 1x, half size or any scale down by an integer amount.
-+// Step can be negative (mirroring, rotate 180).
-+// This is the third fastest of the scalers.
-+// Only defined on Windows x86-32.
-+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int step);
-+
-+// Rotate is like Convert, but applies different step to Y versus U and V.
-+// This allows rotation by 90 or 270, by stepping by stride.
-+// This is the forth fastest of the scalers.
-+// Only defined on Windows x86-32.
-+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                    const uint8* u_buf,
-+                                    const uint8* v_buf,
-+                                    uint8* rgb_buf,
-+                                    int width,
-+                                    int ystep,
-+                                    int uvstep);
- 
- // Doubler does 4 pixels at a time.  Each pixel is replicated.
- // This is the fastest of the scalers.
--void DoubleYUVToRGB32Row(const uint8* y_buf,
--                         const uint8* u_buf,
--                         const uint8* v_buf,
--                         uint8* rgb_buf,
--                         int width);
-+// Only defined on Windows x86-32.
-+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                             const uint8* u_buf,
-+                             const uint8* v_buf,
-+                             uint8* rgb_buf,
-+                             int width);
- 
- // Handles arbitrary scaling up or down.
- // Mirroring is supported, but not 90 or 270 degree rotation.
- // Chroma is under sampled every 2 pixels for performance.
- void ScaleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width,
-                         int source_dx);
- 
-+void ScaleYUVToRGB32Row(const uint8* y_buf,
-+                        const uint8* u_buf,
-+                        const uint8* v_buf,
-+                        uint8* rgb_buf,
-+                        int width,
-+                        int source_dx);
-+
-+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
-+                          const uint8* u_buf,
-+                          const uint8* v_buf,
-+                          uint8* rgb_buf,
-+                          int width,
-+                          int source_dx);
-+
- // Handles arbitrary scaling up or down with bilinear filtering.
- // Mirroring is supported, but not 90 or 270 degree rotation.
- // Chroma is under sampled every 2 pixels for performance.
- // This is the slowest of the scalers.
- void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx);
- 
-+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int source_dx);
-+
-+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
-+                                const uint8* u_buf,
-+                                const uint8* v_buf,
-+                                uint8* rgb_buf,
-+                                int width,
-+                                int source_dx);
-+
-+
- #if defined(_MSC_VER)
- #define SIMD_ALIGNED(var) __declspec(align(16)) var
- #else
- #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
- #endif
- extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
- 
--// Method to force C version.
--//#define USE_MMX 0
--//#define USE_SSE2 0
--
--#if !defined(USE_MMX)
--// Windows, Mac and Linux/BSD use MMX
--#if defined(__MMX__) || defined(_MSC_VER)
--#define USE_MMX 1
--#else
--#define USE_MMX 0
--#endif
--#endif
--
--#if !defined(USE_SSE2)
--#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
--#define USE_SSE2 1
--#else
--#define USE_SSE2 0
--#endif
--#endif
--
- // x64 uses MMX2 (SSE) so emms is not required.
- // Warning C4799: function has no EMMS instruction.
- // EMMS() is slow and should be called by the calling function once per image.
--#if USE_MMX && !defined(ARCH_CPU_X86_64)
-+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
- #if defined(_MSC_VER)
- #define EMMS() __asm emms
- #pragma warning(disable: 4799)
- #else
- #define EMMS() asm("emms")
- #endif
- #else
- #define EMMS()
-diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
---- a/gfx/ycbcr/yuv_row_c.cpp
-+++ b/gfx/ycbcr/yuv_row_c.cpp
-@@ -1,812 +1,18 @@
- // Copyright (c) 2010 The Chromium Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
--#include "media/base/yuv_row.h"
--
--#ifdef _DEBUG
--#include "base/logging.h"
--#else
-+#include "yuv_row.h"
-+
- #define DCHECK(a)
--#endif
- 
- extern "C" {
- 
--#if USE_SSE2 && defined(ARCH_CPU_X86_64)
--
--// AMD64 ABI uses register paremters.
--void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
--                              const uint8* u_buf,  // rsi
--                              const uint8* v_buf,  // rdx
--                              uint8* rgb_buf,      // rcx
--                              int width) {         // r8
--  asm(
--  "jmp    convertend\n"
--"convertloop:"
--  "movzb  (%1),%%r10\n"
--  "add    $0x1,%1\n"
--  "movzb  (%2),%%r11\n"
--  "add    $0x1,%2\n"
--  "movq   2048(%5,%%r10,8),%%xmm0\n"
--  "movzb  (%0),%%r10\n"
--  "movq   4096(%5,%%r11,8),%%xmm1\n"
--  "movzb  0x1(%0),%%r11\n"
--  "paddsw %%xmm1,%%xmm0\n"
--  "movq   (%5,%%r10,8),%%xmm2\n"
--  "add    $0x2,%0\n"
--  "movq   (%5,%%r11,8),%%xmm3\n"
--  "paddsw %%xmm0,%%xmm2\n"
--  "paddsw %%xmm0,%%xmm3\n"
--  "shufps $0x44,%%xmm3,%%xmm2\n"
--  "psraw  $0x6,%%xmm2\n"
--  "packuswb %%xmm2,%%xmm2\n"
--  "movq   %%xmm2,0x0(%3)\n"
--  "add    $0x8,%3\n"
--"convertend:"
--  "sub    $0x2,%4\n"
--  "jns    convertloop\n"
--
--"convertnext:"
--  "add    $0x1,%4\n"
--  "js     convertdone\n"
--
--  "movzb  (%1),%%r10\n"
--  "movq   2048(%5,%%r10,8),%%xmm0\n"
--  "movzb  (%2),%%r10\n"
--  "movq   4096(%5,%%r10,8),%%xmm1\n"
--  "paddsw %%xmm1,%%xmm0\n"
--  "movzb  (%0),%%r10\n"
--  "movq   (%5,%%r10,8),%%xmm1\n"
--  "paddsw %%xmm0,%%xmm1\n"
--  "psraw  $0x6,%%xmm1\n"
--  "packuswb %%xmm1,%%xmm1\n"
--  "movd   %%xmm1,0x0(%3)\n"
--"convertdone:"
--  :
--  : "r"(y_buf),  // %0
--    "r"(u_buf),  // %1
--    "r"(v_buf),  // %2
--    "r"(rgb_buf),  // %3
--    "r"(width),  // %4
--    "r" (kCoefficientsRgbY)  // %5
--  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
--);
--}
--
--void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
--                        const uint8* u_buf,  // rsi
--                        const uint8* v_buf,  // rdx
--                        uint8* rgb_buf,      // rcx
--                        int width,           // r8
--                        int source_dx) {     // r9
--  asm(
--  "xor    %%r11,%%r11\n"
--  "sub    $0x2,%4\n"
--  "js     scalenext\n"
--
--"scaleloop:"
--  "mov    %%r11,%%r10\n"
--  "sar    $0x11,%%r10\n"
--  "movzb  (%1,%%r10,1),%%rax\n"
--  "movq   2048(%5,%%rax,8),%%xmm0\n"
--  "movzb  (%2,%%r10,1),%%rax\n"
--  "movq   4096(%5,%%rax,8),%%xmm1\n"
--  "lea    (%%r11,%6),%%r10\n"
--  "sar    $0x10,%%r11\n"
--  "movzb  (%0,%%r11,1),%%rax\n"
--  "paddsw %%xmm1,%%xmm0\n"
--  "movq   (%5,%%rax,8),%%xmm1\n"
--  "lea    (%%r10,%6),%%r11\n"
--  "sar    $0x10,%%r10\n"
--  "movzb  (%0,%%r10,1),%%rax\n"
--  "movq   (%5,%%rax,8),%%xmm2\n"
--  "paddsw %%xmm0,%%xmm1\n"
--  "paddsw %%xmm0,%%xmm2\n"
--  "shufps $0x44,%%xmm2,%%xmm1\n"
--  "psraw  $0x6,%%xmm1\n"
--  "packuswb %%xmm1,%%xmm1\n"
--  "movq   %%xmm1,0x0(%3)\n"
--  "add    $0x8,%3\n"
--  "sub    $0x2,%4\n"
--  "jns    scaleloop\n"
--
--"scalenext:"
--  "add    $0x1,%4\n"
--  "js     scaledone\n"
--
--  "mov    %%r11,%%r10\n"
--  "sar    $0x11,%%r10\n"
--  "movzb  (%1,%%r10,1),%%rax\n"
--  "movq   2048(%5,%%rax,8),%%xmm0\n"
--  "movzb  (%2,%%r10,1),%%rax\n"
--  "movq   4096(%5,%%rax,8),%%xmm1\n"
--  "paddsw %%xmm1,%%xmm0\n"
--  "sar    $0x10,%%r11\n"
--  "movzb  (%0,%%r11,1),%%rax\n"
--  "movq   (%5,%%rax,8),%%xmm1\n"
--  "paddsw %%xmm0,%%xmm1\n"
--  "psraw  $0x6,%%xmm1\n"
--  "packuswb %%xmm1,%%xmm1\n"
--  "movd   %%xmm1,0x0(%3)\n"
--
--"scaledone:"
--  :
--  : "r"(y_buf),  // %0
--    "r"(u_buf),  // %1
--    "r"(v_buf),  // %2
--    "r"(rgb_buf),  // %3
--    "r"(width),  // %4
--    "r" (kCoefficientsRgbY),  // %5
--    "r"(static_cast<long>(source_dx))  // %6
--  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
--);
--}
--
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width,
--                              int source_dx) {
--  asm(
--  "xor    %%r11,%%r11\n"   // x = 0
--  "sub    $0x2,%4\n"
--  "js     .lscalenext\n"
--  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
--  "jl     .lscalehalf\n"
--  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
--".lscalehalf:"
--
--".lscaleloop:"
--  "mov    %%r11,%%r10\n"
--  "sar    $0x11,%%r10\n"
--
--  "movzb  (%1, %%r10, 1), %%r13 \n"
--  "movzb  1(%1, %%r10, 1), %%r14 \n"
--  "mov    %%r11, %%rax \n"
--  "and    $0x1fffe, %%rax \n"
--  "imul   %%rax, %%r14 \n"
--  "xor    $0x1fffe, %%rax \n"
--  "imul   %%rax, %%r13 \n"
--  "add    %%r14, %%r13 \n"
--  "shr    $17, %%r13 \n"
--  "movq   2048(%5,%%r13,8), %%xmm0\n"
--
--  "movzb  (%2, %%r10, 1), %%r13 \n"
--  "movzb  1(%2, %%r10, 1), %%r14 \n"
--  "mov    %%r11, %%rax \n"
--  "and    $0x1fffe, %%rax \n"
--  "imul   %%rax, %%r14 \n"
--  "xor    $0x1fffe, %%rax \n"
--  "imul   %%rax, %%r13 \n"
--  "add    %%r14, %%r13 \n"
--  "shr    $17, %%r13 \n"
--  "movq   4096(%5,%%r13,8), %%xmm1\n"
--
--  "mov    %%r11, %%rax \n"
--  "lea    (%%r11,%6),%%r10\n"
--  "sar    $0x10,%%r11\n"
--  "paddsw %%xmm1,%%xmm0\n"
--
--  "movzb  (%0, %%r11, 1), %%r13 \n"
--  "movzb  1(%0, %%r11, 1), %%r14 \n"
--  "and    $0xffff, %%rax \n"
--  "imul   %%rax, %%r14 \n"
--  "xor    $0xffff, %%rax \n"
--  "imul   %%rax, %%r13 \n"
--  "add    %%r14, %%r13 \n"
--  "shr    $16, %%r13 \n"
--  "movq   (%5,%%r13,8),%%xmm1\n"
--
--  "mov    %%r10, %%rax \n"
--  "lea    (%%r10,%6),%%r11\n"
--  "sar    $0x10,%%r10\n"
--
--  "movzb  (%0,%%r10,1), %%r13 \n"
--  "movzb  1(%0,%%r10,1), %%r14 \n"
--  "and    $0xffff, %%rax \n"
--  "imul   %%rax, %%r14 \n"
--  "xor    $0xffff, %%rax \n"
--  "imul   %%rax, %%r13 \n"
--  "add    %%r14, %%r13 \n"
--  "shr    $16, %%r13 \n"
--  "movq   (%5,%%r13,8),%%xmm2\n"
--
--  "paddsw %%xmm0,%%xmm1\n"
--  "paddsw %%xmm0,%%xmm2\n"
--  "shufps $0x44,%%xmm2,%%xmm1\n"
--  "psraw  $0x6,%%xmm1\n"
--  "packuswb %%xmm1,%%xmm1\n"
--  "movq   %%xmm1,0x0(%3)\n"
--  "add    $0x8,%3\n"
--  "sub    $0x2,%4\n"
--  "jns    .lscaleloop\n"
--
--".lscalenext:"
--  "add    $0x1,%4\n"
--  "js     .lscaledone\n"
--
--  "mov    %%r11,%%r10\n"
--  "sar    $0x11,%%r10\n"
--
--  "movzb  (%1,%%r10,1), %%r13 \n"
--  "movq   2048(%5,%%r13,8),%%xmm0\n"
--
--  "movzb  (%2,%%r10,1), %%r13 \n"
--  "movq   4096(%5,%%r13,8),%%xmm1\n"
--
--  "paddsw %%xmm1,%%xmm0\n"
--  "sar    $0x10,%%r11\n"
--
--  "movzb  (%0,%%r11,1), %%r13 \n"
--  "movq   (%5,%%r13,8),%%xmm1\n"
--
--  "paddsw %%xmm0,%%xmm1\n"
--  "psraw  $0x6,%%xmm1\n"
--  "packuswb %%xmm1,%%xmm1\n"
--  "movd   %%xmm1,0x0(%3)\n"
--
--".lscaledone:"
--  :
--  : "r"(y_buf),  // %0
--    "r"(u_buf),  // %1
--    "r"(v_buf),  // %2
--    "r"(rgb_buf),  // %3
--    "r"(width),  // %4
--    "r" (kCoefficientsRgbY),  // %5
--    "r"(static_cast<long>(source_dx))  // %6
--  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
--);
--}
--
--#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
--
--// PIC version is slower because less registers are available, so
--// non-PIC is used on platforms where it is possible.
--
--void FastConvertYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width);
--  asm(
--  ".text\n"
--  ".global FastConvertYUVToRGB32Row\n"
--"FastConvertYUVToRGB32Row:\n"
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x28(%esp),%edi\n"
--  "mov    0x2c(%esp),%esi\n"
--  "mov    0x30(%esp),%ebp\n"
--  "mov    0x34(%esp),%ecx\n"
--  "jmp    convertend\n"
--
--"convertloop:"
--  "movzbl (%edi),%eax\n"
--  "add    $0x1,%edi\n"
--  "movzbl (%esi),%ebx\n"
--  "add    $0x1,%esi\n"
--  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
--  "movzbl (%edx),%eax\n"
--  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
--  "movzbl 0x1(%edx),%ebx\n"
--  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
--  "add    $0x2,%edx\n"
--  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--"convertend:"
--  "sub    $0x2,%ecx\n"
--  "jns    convertloop\n"
--
--  "and    $0x1,%ecx\n"
--  "je     convertdone\n"
--
--  "movzbl (%edi),%eax\n"
--  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
--  "movzbl (%esi),%eax\n"
--  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
--  "movzbl (%edx),%eax\n"
--  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
--  "paddsw %mm0,%mm1\n"
--  "psraw  $0x6,%mm1\n"
--  "packuswb %mm1,%mm1\n"
--  "movd   %mm1,0x0(%ebp)\n"
--"convertdone:"
--  "popa\n"
--  "ret\n"
--);
--
--
--void ScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx);
--  asm(
--  ".text\n"
--  ".global ScaleYUVToRGB32Row\n"
--"ScaleYUVToRGB32Row:\n"
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x28(%esp),%edi\n"
--  "mov    0x2c(%esp),%esi\n"
--  "mov    0x30(%esp),%ebp\n"
--  "mov    0x34(%esp),%ecx\n"
--  "xor    %ebx,%ebx\n"
--  "jmp    scaleend\n"
--
--"scaleloop:"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%edi,%eax,1),%eax\n"
--  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%esi,%eax,1),%eax\n"
--  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--"scaleend:"
--  "sub    $0x2,%ecx\n"
--  "jns    scaleloop\n"
--
--  "and    $0x1,%ecx\n"
--  "je     scaledone\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%edi,%eax,1),%eax\n"
--  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%esi,%eax,1),%eax\n"
--  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
--  "paddsw %mm0,%mm1\n"
--  "psraw  $0x6,%mm1\n"
--  "packuswb %mm1,%mm1\n"
--  "movd   %mm1,0x0(%ebp)\n"
--
--"scaledone:"
--  "popa\n"
--  "ret\n"
--);
--
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width,
--                              int source_dx);
--  asm(
--  ".text\n"
--  ".global LinearScaleYUVToRGB32Row\n"
--"LinearScaleYUVToRGB32Row:\n"
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x28(%esp),%edi\n"
--  "mov    0x30(%esp),%ebp\n"
--
--  // source_width = width * source_dx + ebx
--  "mov    0x34(%esp), %ecx\n"
--  "imull  0x38(%esp), %ecx\n"
--  "mov    %ecx, 0x34(%esp)\n"
--
--  "mov    0x38(%esp), %ecx\n"
--  "xor    %ebx,%ebx\n"     // x = 0
--  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
--  "jl     .lscaleend\n"
--  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
--  "jmp    .lscaleend\n"
--
--".lscaleloop:"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--
--  "movzbl (%edi,%eax,1),%ecx\n"
--  "movzbl 1(%edi,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "andl   $0x1fffe, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0x1fffe, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $17, %ecx \n"
--  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
--
--  "mov    0x2c(%esp),%esi\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--
--  "movzbl (%esi,%eax,1),%ecx\n"
--  "movzbl 1(%esi,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "andl   $0x1fffe, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0x1fffe, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $17, %ecx \n"
--  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%ecx\n"
--  "movzbl 1(%edx,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "andl   $0xffff, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0xffff, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $16, %ecx \n"
--  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
--
--  "cmp    0x34(%esp), %ebx\n"
--  "jge    .lscalelastpixel\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%ecx\n"
--  "movzbl 1(%edx,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "andl   $0xffff, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0xffff, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $16, %ecx \n"
--  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
--
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--
--".lscaleend:"
--  "cmp    0x34(%esp), %ebx\n"
--  "jl     .lscaleloop\n"
--  "popa\n"
--  "ret\n"
--
--".lscalelastpixel:"
--  "paddsw %mm0, %mm1\n"
--  "psraw $6, %mm1\n"
--  "packuswb %mm1, %mm1\n"
--  "movd %mm1, (%ebp)\n"
--  "popa\n"
--  "ret\n"
--);
--
--#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
--
--extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
--                                    const uint8* u_buf,
--                                    const uint8* v_buf,
--                                    uint8* rgb_buf,
--                                    int width,
--                                    int16 *kCoefficientsRgbY);
--  asm(
--  ".text\n"
--#if defined(OS_MACOSX)
--"_PICConvertYUVToRGB32Row:\n"
--#else
--"PICConvertYUVToRGB32Row:\n"
--#endif
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x28(%esp),%edi\n"
--  "mov    0x2c(%esp),%esi\n"
--  "mov    0x30(%esp),%ebp\n"
--  "mov    0x38(%esp),%ecx\n"
--
--  "jmp    .Lconvertend\n"
--
--".Lconvertloop:"
--  "movzbl (%edi),%eax\n"
--  "add    $0x1,%edi\n"
--  "movzbl (%esi),%ebx\n"
--  "add    $0x1,%esi\n"
--  "movq   2048(%ecx,%eax,8),%mm0\n"
--  "movzbl (%edx),%eax\n"
--  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
--  "movzbl 0x1(%edx),%ebx\n"
--  "movq   0(%ecx,%eax,8),%mm1\n"
--  "add    $0x2,%edx\n"
--  "movq   0(%ecx,%ebx,8),%mm2\n"
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--".Lconvertend:"
--  "subl   $0x2,0x34(%esp)\n"
--  "jns    .Lconvertloop\n"
--
--  "andl   $0x1,0x34(%esp)\n"
--  "je     .Lconvertdone\n"
--
--  "movzbl (%edi),%eax\n"
--  "movq   2048(%ecx,%eax,8),%mm0\n"
--  "movzbl (%esi),%eax\n"
--  "paddsw 4096(%ecx,%eax,8),%mm0\n"
--  "movzbl (%edx),%eax\n"
--  "movq   0(%ecx,%eax,8),%mm1\n"
--  "paddsw %mm0,%mm1\n"
--  "psraw  $0x6,%mm1\n"
--  "packuswb %mm1,%mm1\n"
--  "movd   %mm1,0x0(%ebp)\n"
--".Lconvertdone:\n"
--  "popa\n"
--  "ret\n"
--);
--
--void FastConvertYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width) {
--  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
--                          &kCoefficientsRgbY[0][0]);
--}
--
--extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
--                               const uint8* u_buf,
--                               const uint8* v_buf,
--                               uint8* rgb_buf,
--                               int width,
--                               int source_dx,
--                               int16 *kCoefficientsRgbY);
--
--  asm(
--  ".text\n"
--#if defined(OS_MACOSX)
--"_PICScaleYUVToRGB32Row:\n"
--#else
--"PICScaleYUVToRGB32Row:\n"
--#endif
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x28(%esp),%edi\n"
--  "mov    0x2c(%esp),%esi\n"
--  "mov    0x30(%esp),%ebp\n"
--  "mov    0x3c(%esp),%ecx\n"
--  "xor    %ebx,%ebx\n"
--  "jmp    Lscaleend\n"
--
--"Lscaleloop:"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%edi,%eax,1),%eax\n"
--  "movq   2048(%ecx,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%esi,%eax,1),%eax\n"
--  "paddsw 4096(%ecx,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   0(%ecx,%eax,8),%mm1\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   0(%ecx,%eax,8),%mm2\n"
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--"Lscaleend:"
--  "subl   $0x2,0x34(%esp)\n"
--  "jns    Lscaleloop\n"
--
--  "andl   $0x1,0x34(%esp)\n"
--  "je     Lscaledone\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%edi,%eax,1),%eax\n"
--  "movq   2048(%ecx,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--  "movzbl (%esi,%eax,1),%eax\n"
--  "paddsw 4096(%ecx,%eax,8),%mm0\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%eax\n"
--  "movq   0(%ecx,%eax,8),%mm1\n"
--  "paddsw %mm0,%mm1\n"
--  "psraw  $0x6,%mm1\n"
--  "packuswb %mm1,%mm1\n"
--  "movd   %mm1,0x0(%ebp)\n"
--
--"Lscaledone:"
--  "popa\n"
--  "ret\n"
--);
--
--
--void ScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx) {
--  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
--                        &kCoefficientsRgbY[0][0]);
--}
--
--void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
--                                 const uint8* u_buf,
--                                 const uint8* v_buf,
--                                 uint8* rgb_buf,
--                                 int width,
--                                 int source_dx,
--                                 int16 *kCoefficientsRgbY);
--  asm(
--  ".text\n"
--#if defined(OS_MACOSX)
--"_PICLinearScaleYUVToRGB32Row:\n"
--#else
--"PICLinearScaleYUVToRGB32Row:\n"
--#endif
--  "pusha\n"
--  "mov    0x24(%esp),%edx\n"
--  "mov    0x30(%esp),%ebp\n"
--  "mov    0x34(%esp),%ecx\n"
--  "mov    0x3c(%esp),%edi\n"
--  "xor    %ebx,%ebx\n"
--
--  // source_width = width * source_dx + ebx
--  "mov    0x34(%esp), %ecx\n"
--  "imull  0x38(%esp), %ecx\n"
--  "mov    %ecx, 0x34(%esp)\n"
--
--  "mov    0x38(%esp), %ecx\n"
--  "xor    %ebx,%ebx\n"     // x = 0
--  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
--  "jl     .lscaleend\n"
--  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
--  "jmp    .lscaleend\n"
--
--".lscaleloop:"
--  "mov    0x28(%esp),%esi\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--
--  "movzbl (%esi,%eax,1),%ecx\n"
--  "movzbl 1(%esi,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "andl   $0x1fffe, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0x1fffe, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $17, %ecx \n"
--  "movq   2048(%edi,%ecx,8),%mm0\n"
--
--  "mov    0x2c(%esp),%esi\n"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
--
--  "movzbl (%esi,%eax,1),%ecx\n"
--  "movzbl 1(%esi,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "andl   $0x1fffe, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0x1fffe, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $17, %ecx \n"
--  "paddsw 4096(%edi,%ecx,8),%mm0\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%ecx\n"
--  "movzbl 1(%edx,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "andl   $0xffff, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0xffff, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $16, %ecx \n"
--  "movq   (%edi,%ecx,8),%mm1\n"
--
--  "cmp    0x34(%esp), %ebx\n"
--  "jge    .lscalelastpixel\n"
--
--  "mov    %ebx,%eax\n"
--  "sar    $0x10,%eax\n"
--  "movzbl (%edx,%eax,1),%ecx\n"
--  "movzbl 1(%edx,%eax,1),%esi\n"
--  "mov    %ebx,%eax\n"
--  "add    0x38(%esp),%ebx\n"
--  "andl   $0xffff, %eax \n"
--  "imul   %eax, %esi \n"
--  "xorl   $0xffff, %eax \n"
--  "imul   %eax, %ecx \n"
--  "addl   %esi, %ecx \n"
--  "shrl   $16, %ecx \n"
--  "movq   (%edi,%ecx,8),%mm2\n"
--
--  "paddsw %mm0,%mm1\n"
--  "paddsw %mm0,%mm2\n"
--  "psraw  $0x6,%mm1\n"
--  "psraw  $0x6,%mm2\n"
--  "packuswb %mm2,%mm1\n"
--  "movntq %mm1,0x0(%ebp)\n"
--  "add    $0x8,%ebp\n"
--
--".lscaleend:"
--  "cmp    %ebx, 0x34(%esp)\n"
--  "jg     .lscaleloop\n"
--  "popa\n"
--  "ret\n"
--
--".lscalelastpixel:"
--  "paddsw %mm0, %mm1\n"
--  "psraw $6, %mm1\n"
--  "packuswb %mm1, %mm1\n"
--  "movd %mm1, (%ebp)\n"
--  "popa\n"
--  "ret\n"
--);
--
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx) {
--  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
--                              &kCoefficientsRgbY[0][0]);
--}
--
--#else  // USE_MMX
--
- // C reference code that mimic the YUV assembly.
- #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
- #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
-     (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
- 
- static inline void YuvPixel(uint8 y,
-                             uint8 u,
-                             uint8 v,
-@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
-   a >>= 6;
- 
-   *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
-                                         (packuswb(g) << 8) |
-                                         (packuswb(r) << 16) |
-                                         (packuswb(a) << 24);
- }
- 
--void FastConvertYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width) {
-+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
-+                                const uint8* u_buf,
-+                                const uint8* v_buf,
-+                                uint8* rgb_buf,
-+                                int width,
-+                                unsigned int x_shift) {
-   for (int x = 0; x < width; x += 2) {
--    uint8 u = u_buf[x >> 1];
--    uint8 v = v_buf[x >> 1];
-+    uint8 u = u_buf[x >> x_shift];
-+    uint8 v = v_buf[x >> x_shift];
-     uint8 y0 = y_buf[x];
-     YuvPixel(y0, u, v, rgb_buf);
-     if ((x + 1) < width) {
-       uint8 y1 = y_buf[x + 1];
-+      if (x_shift == 0) {
-+        u = u_buf[x + 1];
-+        v = v_buf[x + 1];
-+      }
-       YuvPixel(y1, u, v, rgb_buf + 4);
-     }
-     rgb_buf += 8;  // Advance 2 pixels.
-   }
- }
- 
- // 16.16 fixed point is used.  A shift by 16 isolates the integer.
- // A shift by 17 is used to further subsample the chrominence channels.
- // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
- // for 1/65536 pixel accurate interpolation.
--void ScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx) {
-+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
-+                          const uint8* u_buf,
-+                          const uint8* v_buf,
-+                          uint8* rgb_buf,
-+                          int width,
-+                          int source_dx) {
-   int x = 0;
-   for (int i = 0; i < width; i += 2) {
-     int y = y_buf[x >> 16];
-     int u = u_buf[(x >> 17)];
-     int v = v_buf[(x >> 17)];
-     YuvPixel(y, u, v, rgb_buf);
-     x += source_dx;
-     if ((i + 1) < width) {
-       y = y_buf[x >> 16];
-       YuvPixel(y, u, v, rgb_buf+4);
-       x += source_dx;
-     }
-     rgb_buf += 8;
-   }
- }
- 
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width,
--                              int source_dx) {
-+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
-+                                const uint8* u_buf,
-+                                const uint8* v_buf,
-+                                uint8* rgb_buf,
-+                                int width,
-+                                int source_dx) {
-   int x = 0;
-   if (source_dx >= 0x20000) {
-     x = 32768;
-   }
-   for (int i = 0; i < width; i += 2) {
-     int y0 = y_buf[x >> 16];
-     int y1 = y_buf[(x >> 16) + 1];
-     int u0 = u_buf[(x >> 17)];
-@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
-       y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
-       YuvPixel(y, u, v, rgb_buf+4);
-       x += source_dx;
-     }
-     rgb_buf += 8;
-   }
- }
- 
--#endif  // USE_MMX
- }  // extern "C"
- 
-diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
---- a/gfx/ycbcr/yuv_row_posix.cpp
-+++ b/gfx/ycbcr/yuv_row_posix.cpp
-@@ -1,33 +1,32 @@
- // Copyright (c) 2010 The Chromium Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
--#include "media/base/yuv_row.h"
--
--#ifdef _DEBUG
--#include "base/logging.h"
--#else
-+#include "yuv_row.h"
-+#include "mozilla/SSE.h"
-+
- #define DCHECK(a)
--#endif
- 
- extern "C" {
- 
--#if USE_SSE2 && defined(ARCH_CPU_X86_64)
-+#if defined(ARCH_CPU_X86_64)
-+
-+// We don't need CPUID guards here, since x86-64 implies SSE2.
- 
- // AMD64 ABI uses register paremters.
- void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
-                               const uint8* u_buf,  // rsi
-                               const uint8* v_buf,  // rdx
-                               uint8* rgb_buf,      // rcx
-                               int width) {         // r8
-   asm(
--  "jmp    convertend\n"
--"convertloop:"
-+  "jmp    1f\n"
-+"0:"
-   "movzb  (%1),%%r10\n"
-   "add    $0x1,%1\n"
-   "movzb  (%2),%%r11\n"
-   "add    $0x1,%2\n"
-   "movq   2048(%5,%%r10,8),%%xmm0\n"
-   "movzb  (%0),%%r10\n"
-   "movq   4096(%5,%%r11,8),%%xmm1\n"
-   "movzb  0x1(%0),%%r11\n"
-@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
-   "movq   (%5,%%r11,8),%%xmm3\n"
-   "paddsw %%xmm0,%%xmm2\n"
-   "paddsw %%xmm0,%%xmm3\n"
-   "shufps $0x44,%%xmm3,%%xmm2\n"
-   "psraw  $0x6,%%xmm2\n"
-   "packuswb %%xmm2,%%xmm2\n"
-   "movq   %%xmm2,0x0(%3)\n"
-   "add    $0x8,%3\n"
--"convertend:"
-+"1:"
-   "sub    $0x2,%4\n"
--  "jns    convertloop\n"
--
--"convertnext:"
-+  "jns    0b\n"
-+
-+"2:"
-   "add    $0x1,%4\n"
--  "js     convertdone\n"
-+  "js     3f\n"
- 
-   "movzb  (%1),%%r10\n"
-   "movq   2048(%5,%%r10,8),%%xmm0\n"
-   "movzb  (%2),%%r10\n"
-   "movq   4096(%5,%%r10,8),%%xmm1\n"
-   "paddsw %%xmm1,%%xmm0\n"
-   "movzb  (%0),%%r10\n"
-   "movq   (%5,%%r10,8),%%xmm1\n"
-   "paddsw %%xmm0,%%xmm1\n"
-   "psraw  $0x6,%%xmm1\n"
-   "packuswb %%xmm1,%%xmm1\n"
-   "movd   %%xmm1,0x0(%3)\n"
--"convertdone:"
-+"3:"
-   :
-   : "r"(y_buf),  // %0
-     "r"(u_buf),  // %1
-     "r"(v_buf),  // %2
-     "r"(rgb_buf),  // %3
-     "r"(width),  // %4
-     "r" (kCoefficientsRgbY)  // %5
-   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
-                         const uint8* u_buf,  // rsi
-                         const uint8* v_buf,  // rdx
-                         uint8* rgb_buf,      // rcx
-                         int width,           // r8
-                         int source_dx) {     // r9
-   asm(
-   "xor    %%r11,%%r11\n"
-   "sub    $0x2,%4\n"
--  "js     scalenext\n"
--
--"scaleloop:"
-+  "js     1f\n"
-+
-+"0:"
-   "mov    %%r11,%%r10\n"
-   "sar    $0x11,%%r10\n"
-   "movzb  (%1,%%r10,1),%%rax\n"
-   "movq   2048(%5,%%rax,8),%%xmm0\n"
-   "movzb  (%2,%%r10,1),%%rax\n"
-   "movq   4096(%5,%%rax,8),%%xmm1\n"
-   "lea    (%%r11,%6),%%r10\n"
-   "sar    $0x10,%%r11\n"
-@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
-   "paddsw %%xmm0,%%xmm1\n"
-   "paddsw %%xmm0,%%xmm2\n"
-   "shufps $0x44,%%xmm2,%%xmm1\n"
-   "psraw  $0x6,%%xmm1\n"
-   "packuswb %%xmm1,%%xmm1\n"
-   "movq   %%xmm1,0x0(%3)\n"
-   "add    $0x8,%3\n"
-   "sub    $0x2,%4\n"
--  "jns    scaleloop\n"
--
--"scalenext:"
-+  "jns    0b\n"
-+
-+"1:"
-   "add    $0x1,%4\n"
--  "js     scaledone\n"
-+  "js     2f\n"
- 
-   "mov    %%r11,%%r10\n"
-   "sar    $0x11,%%r10\n"
-   "movzb  (%1,%%r10,1),%%rax\n"
-   "movq   2048(%5,%%rax,8),%%xmm0\n"
-   "movzb  (%2,%%r10,1),%%rax\n"
-   "movq   4096(%5,%%rax,8),%%xmm1\n"
-   "paddsw %%xmm1,%%xmm0\n"
-   "sar    $0x10,%%r11\n"
-   "movzb  (%0,%%r11,1),%%rax\n"
-   "movq   (%5,%%rax,8),%%xmm1\n"
-   "paddsw %%xmm0,%%xmm1\n"
-   "psraw  $0x6,%%xmm1\n"
-   "packuswb %%xmm1,%%xmm1\n"
-   "movd   %%xmm1,0x0(%3)\n"
- 
--"scaledone:"
-+"2:"
-   :
-   : "r"(y_buf),  // %0
-     "r"(u_buf),  // %1
-     "r"(v_buf),  // %2
-     "r"(rgb_buf),  // %3
-     "r"(width),  // %4
-     "r" (kCoefficientsRgbY),  // %5
-     "r"(static_cast<long>(source_dx))  // %6
-@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx) {
-   asm(
-   "xor    %%r11,%%r11\n"   // x = 0
-   "sub    $0x2,%4\n"
--  "js     .lscalenext\n"
-+  "js     2f\n"
-   "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
--  "jl     .lscalehalf\n"
-+  "jl     0f\n"
-   "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
--".lscalehalf:"
--
--".lscaleloop:"
-+"0:"
-+
-+"1:"
-   "mov    %%r11,%%r10\n"
-   "sar    $0x11,%%r10\n"
- 
-   "movzb  (%1, %%r10, 1), %%r13 \n"
-   "movzb  1(%1, %%r10, 1), %%r14 \n"
-   "mov    %%r11, %%rax \n"
-   "and    $0x1fffe, %%rax \n"
-   "imul   %%rax, %%r14 \n"
-@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
-   "paddsw %%xmm0,%%xmm1\n"
-   "paddsw %%xmm0,%%xmm2\n"
-   "shufps $0x44,%%xmm2,%%xmm1\n"
-   "psraw  $0x6,%%xmm1\n"
-   "packuswb %%xmm1,%%xmm1\n"
-   "movq   %%xmm1,0x0(%3)\n"
-   "add    $0x8,%3\n"
-   "sub    $0x2,%4\n"
--  "jns    .lscaleloop\n"
--
--".lscalenext:"
-+  "jns    1b\n"
-+
-+"2:"
-   "add    $0x1,%4\n"
--  "js     .lscaledone\n"
-+  "js     3f\n"
- 
-   "mov    %%r11,%%r10\n"
-   "sar    $0x11,%%r10\n"
- 
-   "movzb  (%1,%%r10,1), %%r13 \n"
-   "movq   2048(%5,%%r13,8),%%xmm0\n"
- 
-   "movzb  (%2,%%r10,1), %%r13 \n"
-@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
-   "movzb  (%0,%%r11,1), %%r13 \n"
-   "movq   (%5,%%r13,8),%%xmm1\n"
- 
-   "paddsw %%xmm0,%%xmm1\n"
-   "psraw  $0x6,%%xmm1\n"
-   "packuswb %%xmm1,%%xmm1\n"
-   "movd   %%xmm1,0x0(%3)\n"
- 
--".lscaledone:"
-+"3:"
-   :
-   : "r"(y_buf),  // %0
-     "r"(u_buf),  // %1
-     "r"(v_buf),  // %2
-     "r"(rgb_buf),  // %3
-     "r"(width),  // %4
-     "r" (kCoefficientsRgbY),  // %5
-     "r"(static_cast<long>(source_dx))  // %6
-   : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
- );
- }
- 
--#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
-+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
- 
- // PIC version is slower because less registers are available, so
- // non-PIC is used on platforms where it is possible.
--
--void FastConvertYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width);
-+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                  const uint8* u_buf,
-+                                  const uint8* v_buf,
-+                                  uint8* rgb_buf,
-+                                  int width);
-   asm(
-   ".text\n"
--  ".global FastConvertYUVToRGB32Row\n"
--"FastConvertYUVToRGB32Row:\n"
-+  ".global FastConvertYUVToRGB32Row_SSE\n"
-+  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
-+"FastConvertYUVToRGB32Row_SSE:\n"
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x28(%esp),%edi\n"
-   "mov    0x2c(%esp),%esi\n"
-   "mov    0x30(%esp),%ebp\n"
-   "mov    0x34(%esp),%ecx\n"
--  "jmp    convertend\n"
--
--"convertloop:"
-+  "jmp    1f\n"
-+
-+"0:"
-   "movzbl (%edi),%eax\n"
-   "add    $0x1,%edi\n"
-   "movzbl (%esi),%ebx\n"
-   "add    $0x1,%esi\n"
-   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-   "movzbl (%edx),%eax\n"
-   "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-   "movzbl 0x1(%edx),%ebx\n"
-@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
-   "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
--"convertend:"
-+"1:"
-   "sub    $0x2,%ecx\n"
--  "jns    convertloop\n"
-+  "jns    0b\n"
- 
-   "and    $0x1,%ecx\n"
--  "je     convertdone\n"
-+  "je     2f\n"
- 
-   "movzbl (%edi),%eax\n"
-   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-   "movzbl (%esi),%eax\n"
-   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-   "movzbl (%edx),%eax\n"
-   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-   "paddsw %mm0,%mm1\n"
-   "psraw  $0x6,%mm1\n"
-   "packuswb %mm1,%mm1\n"
-   "movd   %mm1,0x0(%ebp)\n"
--"convertdone:"
-+"2:"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
--
--void ScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx);
-+void FastConvertYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width)
-+{
-+  if (mozilla::supports_sse()) {
-+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
-+    return;
-+  }
-+
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
-+}
-+
-+
-+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                            const uint8* u_buf,
-+                            const uint8* v_buf,
-+                            uint8* rgb_buf,
-+                            int width,
-+                            int source_dx);
-   asm(
-   ".text\n"
--  ".global ScaleYUVToRGB32Row\n"
--"ScaleYUVToRGB32Row:\n"
-+  ".global ScaleYUVToRGB32Row_SSE\n"
-+  ".type ScaleYUVToRGB32Row_SSE, @function\n"
-+"ScaleYUVToRGB32Row_SSE:\n"
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x28(%esp),%edi\n"
-   "mov    0x2c(%esp),%esi\n"
-   "mov    0x30(%esp),%ebp\n"
-   "mov    0x34(%esp),%ecx\n"
-   "xor    %ebx,%ebx\n"
--  "jmp    scaleend\n"
--
--"scaleloop:"
-+  "jmp    1f\n"
-+
-+"0:"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%edi,%eax,1),%eax\n"
-   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%esi,%eax,1),%eax\n"
-   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
-   "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
--"scaleend:"
-+"1:"
-   "sub    $0x2,%ecx\n"
--  "jns    scaleloop\n"
-+  "jns    0b\n"
- 
-   "and    $0x1,%ecx\n"
--  "je     scaledone\n"
-+  "je     2f\n"
- 
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%edi,%eax,1),%eax\n"
-   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%esi,%eax,1),%eax\n"
-@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
-   "sar    $0x10,%eax\n"
-   "movzbl (%edx,%eax,1),%eax\n"
-   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-   "paddsw %mm0,%mm1\n"
-   "psraw  $0x6,%mm1\n"
-   "packuswb %mm1,%mm1\n"
-   "movd   %mm1,0x0(%ebp)\n"
- 
--"scaledone:"
-+"2:"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width,
--                              int source_dx);
-+void ScaleYUVToRGB32Row(const uint8* y_buf,
-+                        const uint8* u_buf,
-+                        const uint8* v_buf,
-+                        uint8* rgb_buf,
-+                        int width,
-+                        int source_dx)
-+{
-+  if (mozilla::supports_sse()) {
-+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
-+                           width, source_dx);
-+  }
-+
-+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
-+                       width, source_dx);
-+}
-+
-+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                  const uint8* u_buf,
-+                                  const uint8* v_buf,
-+                                  uint8* rgb_buf,
-+                                  int width,
-+                                  int source_dx);
-   asm(
-   ".text\n"
--  ".global LinearScaleYUVToRGB32Row\n"
--"LinearScaleYUVToRGB32Row:\n"
-+  ".global LinearScaleYUVToRGB32Row_SSE\n"
-+  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
-+"LinearScaleYUVToRGB32Row_SSE:\n"
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x28(%esp),%edi\n"
-   "mov    0x30(%esp),%ebp\n"
- 
-   // source_width = width * source_dx + ebx
-   "mov    0x34(%esp), %ecx\n"
-   "imull  0x38(%esp), %ecx\n"
-   "mov    %ecx, 0x34(%esp)\n"
- 
-   "mov    0x38(%esp), %ecx\n"
-   "xor    %ebx,%ebx\n"     // x = 0
-   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
--  "jl     .lscaleend\n"
-+  "jl     1f\n"
-   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
--  "jmp    .lscaleend\n"
--
--".lscaleloop:"
--  "mov    %ebx,%eax\n"
--  "sar    $0x11,%eax\n"
-+  "jmp    1f\n"
-+
-+"0:"
-+  "mov    %ebx,%eax\n"
-+  "sar    $0x11,%eax\n"
- 
-   "movzbl (%edi,%eax,1),%ecx\n"
-   "movzbl 1(%edi,%eax,1),%esi\n"
-   "mov    %ebx,%eax\n"
-   "andl   $0x1fffe, %eax \n"
-   "imul   %eax, %esi \n"
-   "xorl   $0x1fffe, %eax \n"
-   "imul   %eax, %ecx \n"
-@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
-   "imul   %eax, %esi \n"
-   "xorl   $0xffff, %eax \n"
-   "imul   %eax, %ecx \n"
-   "addl   %esi, %ecx \n"
-   "shrl   $16, %ecx \n"
-   "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
- 
-   "cmp    0x34(%esp), %ebx\n"
--  "jge    .lscalelastpixel\n"
-+  "jge    2f\n"
- 
-   "mov    %ebx,%eax\n"
-   "sar    $0x10,%eax\n"
-   "movzbl (%edx,%eax,1),%ecx\n"
-   "movzbl 1(%edx,%eax,1),%esi\n"
-   "mov    %ebx,%eax\n"
-   "add    0x38(%esp),%ebx\n"
-   "andl   $0xffff, %eax \n"
-@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
- 
--".lscaleend:"
-+"1:"
-   "cmp    0x34(%esp), %ebx\n"
--  "jl     .lscaleloop\n"
-+  "jl     0b\n"
-   "popa\n"
-   "ret\n"
- 
--".lscalelastpixel:"
-+"2:"
-   "paddsw %mm0, %mm1\n"
-   "psraw $6, %mm1\n"
-   "packuswb %mm1, %mm1\n"
-   "movd %mm1, (%ebp)\n"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
--#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
--
--extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
--                                    const uint8* u_buf,
--                                    const uint8* v_buf,
--                                    uint8* rgb_buf,
--                                    int width,
--                                    int16 *kCoefficientsRgbY);
-+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int source_dx)
-+{
-+  if (mozilla::supports_sse()) {
-+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
-+                                 width, source_dx);
-+  }
-+
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
-+                             width, source_dx);
-+}
-+
-+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
-+
-+void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                 const uint8* u_buf,
-+                                 const uint8* v_buf,
-+                                 uint8* rgb_buf,
-+                                 int width,
-+                                 int16 *kCoefficientsRgbY);
-+
-   asm(
-   ".text\n"
--#if defined(OS_MACOSX)
--"_PICConvertYUVToRGB32Row:\n"
-+#if defined(XP_MACOSX)
-+"_PICConvertYUVToRGB32Row_SSE:\n"
- #else
--"PICConvertYUVToRGB32Row:\n"
-+"PICConvertYUVToRGB32Row_SSE:\n"
- #endif
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x28(%esp),%edi\n"
-   "mov    0x2c(%esp),%esi\n"
-   "mov    0x30(%esp),%ebp\n"
-   "mov    0x38(%esp),%ecx\n"
- 
--  "jmp    .Lconvertend\n"
--
--".Lconvertloop:"
-+  "jmp    1f\n"
-+
-+"0:"
-   "movzbl (%edi),%eax\n"
-   "add    $0x1,%edi\n"
-   "movzbl (%esi),%ebx\n"
-   "add    $0x1,%esi\n"
-   "movq   2048(%ecx,%eax,8),%mm0\n"
-   "movzbl (%edx),%eax\n"
-   "paddsw 4096(%ecx,%ebx,8),%mm0\n"
-   "movzbl 0x1(%edx),%ebx\n"
-@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
-   "movq   0(%ecx,%ebx,8),%mm2\n"
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
--".Lconvertend:"
-+"1:"
-   "subl   $0x2,0x34(%esp)\n"
--  "jns    .Lconvertloop\n"
-+  "jns    0b\n"
- 
-   "andl   $0x1,0x34(%esp)\n"
--  "je     .Lconvertdone\n"
-+  "je     2f\n"
- 
-   "movzbl (%edi),%eax\n"
-   "movq   2048(%ecx,%eax,8),%mm0\n"
-   "movzbl (%esi),%eax\n"
-   "paddsw 4096(%ecx,%eax,8),%mm0\n"
-   "movzbl (%edx),%eax\n"
-   "movq   0(%ecx,%eax,8),%mm1\n"
-   "paddsw %mm0,%mm1\n"
-   "psraw  $0x6,%mm1\n"
-   "packuswb %mm1,%mm1\n"
-   "movd   %mm1,0x0(%ebp)\n"
--".Lconvertdone:\n"
-+"2:"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
--                              int width) {
--  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
--                          &kCoefficientsRgbY[0][0]);
--}
--
--extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
-+                              int width)
-+{
-+  if (mozilla::supports_sse()) {
-+    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
-+                                &kCoefficientsRgbY[0][0]);
-+    return;
-+  }
-+
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
-+}
-+
-+void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* rgb_buf,
-                                int width,
-                                int source_dx,
-                                int16 *kCoefficientsRgbY);
- 
-   asm(
-   ".text\n"
--#if defined(OS_MACOSX)
--"_PICScaleYUVToRGB32Row:\n"
-+#if defined(XP_MACOSX)
-+"_PICScaleYUVToRGB32Row_SSE:\n"
- #else
--"PICScaleYUVToRGB32Row:\n"
-+"PICScaleYUVToRGB32Row_SSE:\n"
- #endif
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x28(%esp),%edi\n"
-   "mov    0x2c(%esp),%esi\n"
-   "mov    0x30(%esp),%ebp\n"
-   "mov    0x3c(%esp),%ecx\n"
-   "xor    %ebx,%ebx\n"
--  "jmp    Lscaleend\n"
--
--"Lscaleloop:"
-+  "jmp    1f\n"
-+
-+"0:"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%edi,%eax,1),%eax\n"
-   "movq   2048(%ecx,%eax,8),%mm0\n"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%esi,%eax,1),%eax\n"
-   "paddsw 4096(%ecx,%eax,8),%mm0\n"
-@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 
-   "movq   0(%ecx,%eax,8),%mm2\n"
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
--"Lscaleend:"
-+"1:"
-   "subl   $0x2,0x34(%esp)\n"
--  "jns    Lscaleloop\n"
-+  "jns    0b\n"
- 
-   "andl   $0x1,0x34(%esp)\n"
--  "je     Lscaledone\n"
-+  "je     2f\n"
- 
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%edi,%eax,1),%eax\n"
-   "movq   2048(%ecx,%eax,8),%mm0\n"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
-   "movzbl (%esi,%eax,1),%eax\n"
-@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 
-   "sar    $0x10,%eax\n"
-   "movzbl (%edx,%eax,1),%eax\n"
-   "movq   0(%ecx,%eax,8),%mm1\n"
-   "paddsw %mm0,%mm1\n"
-   "psraw  $0x6,%mm1\n"
-   "packuswb %mm1,%mm1\n"
-   "movd   %mm1,0x0(%ebp)\n"
- 
--"Lscaledone:"
-+"2:"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
--
- void ScaleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width,
--                        int source_dx) {
--  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
--                        &kCoefficientsRgbY[0][0]);
--}
--
--void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
--                                 const uint8* u_buf,
--                                 const uint8* v_buf,
--                                 uint8* rgb_buf,
--                                 int width,
--                                 int source_dx,
--                                 int16 *kCoefficientsRgbY);
-+                        int source_dx)
-+{
-+  if (mozilla::supports_sse()) {
-+    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-+                              &kCoefficientsRgbY[0][0]);
-+    return;
-+  }
-+
-+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
-+
-+void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                     const uint8* u_buf,
-+                                     const uint8* v_buf,
-+                                     uint8* rgb_buf,
-+                                     int width,
-+                                     int source_dx,
-+                                     int16 *kCoefficientsRgbY);
-+
-   asm(
-   ".text\n"
--#if defined(OS_MACOSX)
--"_PICLinearScaleYUVToRGB32Row:\n"
-+#if defined(XP_MACOSX)
-+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
- #else
--"PICLinearScaleYUVToRGB32Row:\n"
-+"PICLinearScaleYUVToRGB32Row_SSE:\n"
- #endif
-   "pusha\n"
-   "mov    0x24(%esp),%edx\n"
-   "mov    0x30(%esp),%ebp\n"
-   "mov    0x34(%esp),%ecx\n"
-   "mov    0x3c(%esp),%edi\n"
-   "xor    %ebx,%ebx\n"
- 
-   // source_width = width * source_dx + ebx
-   "mov    0x34(%esp), %ecx\n"
-   "imull  0x38(%esp), %ecx\n"
-   "mov    %ecx, 0x34(%esp)\n"
- 
-   "mov    0x38(%esp), %ecx\n"
-   "xor    %ebx,%ebx\n"     // x = 0
-   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
--  "jl     .lscaleend\n"
-+  "jl     1f\n"
-   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
--  "jmp    .lscaleend\n"
--
--".lscaleloop:"
-+  "jmp    1f\n"
-+
-+"0:"
-   "mov    0x28(%esp),%esi\n"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
- 
-   "movzbl (%esi,%eax,1),%ecx\n"
-   "movzbl 1(%esi,%eax,1),%esi\n"
-   "mov    %ebx,%eax\n"
-   "andl   $0x1fffe, %eax \n"
-@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
-   "imul   %eax, %esi \n"
-   "xorl   $0xffff, %eax \n"
-   "imul   %eax, %ecx \n"
-   "addl   %esi, %ecx \n"
-   "shrl   $16, %ecx \n"
-   "movq   (%edi,%ecx,8),%mm1\n"
- 
-   "cmp    0x34(%esp), %ebx\n"
--  "jge    .lscalelastpixel\n"
-+  "jge    2f\n"
- 
-   "mov    %ebx,%eax\n"
-   "sar    $0x10,%eax\n"
-   "movzbl (%edx,%eax,1),%ecx\n"
-   "movzbl 1(%edx,%eax,1),%esi\n"
-   "mov    %ebx,%eax\n"
-   "add    0x38(%esp),%ebx\n"
-   "andl   $0xffff, %eax \n"
-@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
-   "paddsw %mm0,%mm1\n"
-   "paddsw %mm0,%mm2\n"
-   "psraw  $0x6,%mm1\n"
-   "psraw  $0x6,%mm2\n"
-   "packuswb %mm2,%mm1\n"
-   "movntq %mm1,0x0(%ebp)\n"
-   "add    $0x8,%ebp\n"
- 
--".lscaleend:"
-+"1:"
-   "cmp    %ebx, 0x34(%esp)\n"
--  "jg     .lscaleloop\n"
-+  "jg     0b\n"
-   "popa\n"
-   "ret\n"
- 
--".lscalelastpixel:"
-+"2:"
-   "paddsw %mm0, %mm1\n"
-   "psraw $6, %mm1\n"
-   "packuswb %mm1, %mm1\n"
-   "movd %mm1, (%ebp)\n"
-   "popa\n"
-   "ret\n"
-+#if !defined(XP_MACOSX)
-+  ".previous\n"
-+#endif
- );
- 
-+
- void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx) {
--  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
--                              &kCoefficientsRgbY[0][0]);
--}
--
--#else  // USE_MMX
--
--// C reference code that mimic the YUV assembly.
--#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
--#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
--    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
--
--static inline void YuvPixel(uint8 y,
--                            uint8 u,
--                            uint8 v,
--                            uint8* rgb_buf) {
--
--  int b = kCoefficientsRgbY[256+u][0];
--  int g = kCoefficientsRgbY[256+u][1];
--  int r = kCoefficientsRgbY[256+u][2];
--  int a = kCoefficientsRgbY[256+u][3];
--
--  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
--  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
--  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
--  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
--
--  b = paddsw(b, kCoefficientsRgbY[y][0]);
--  g = paddsw(g, kCoefficientsRgbY[y][1]);
--  r = paddsw(r, kCoefficientsRgbY[y][2]);
--  a = paddsw(a, kCoefficientsRgbY[y][3]);
--
--  b >>= 6;
--  g >>= 6;
--  r >>= 6;
--  a >>= 6;
--
--  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
--                                        (packuswb(g) << 8) |
--                                        (packuswb(r) << 16) |
--                                        (packuswb(a) << 24);
--}
--
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int source_dx)
-+{
-+  if (mozilla::supports_sse()) {
-+    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
-+                                    source_dx, &kCoefficientsRgbY[0][0]);
-+    return;
-+  }
-+
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
-+#else
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width) {
--  for (int x = 0; x < width; x += 2) {
--    uint8 u = u_buf[x >> 1];
--    uint8 v = v_buf[x >> 1];
--    uint8 y0 = y_buf[x];
--    YuvPixel(y0, u, v, rgb_buf);
--    if ((x + 1) < width) {
--      uint8 y1 = y_buf[x + 1];
--      YuvPixel(y1, u, v, rgb_buf + 4);
--    }
--    rgb_buf += 8;  // Advance 2 pixels.
--  }
--}
--
--// 16.16 fixed point is used.  A shift by 16 isolates the integer.
--// A shift by 17 is used to further subsample the chrominence channels.
--// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
--// for 1/65536 pixel accurate interpolation.
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
-+}
-+
- void ScaleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width,
-                         int source_dx) {
--  int x = 0;
--  for (int i = 0; i < width; i += 2) {
--    int y = y_buf[x >> 16];
--    int u = u_buf[(x >> 17)];
--    int v = v_buf[(x >> 17)];
--    YuvPixel(y, u, v, rgb_buf);
--    x += source_dx;
--    if ((i + 1) < width) {
--      y = y_buf[x >> 16];
--      YuvPixel(y, u, v, rgb_buf+4);
--      x += source_dx;
--    }
--    rgb_buf += 8;
--  }
--}
-+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
- 
- void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx) {
--  int x = 0;
--  if (source_dx >= 0x20000) {
--    x = 32768;
--  }
--  for (int i = 0; i < width; i += 2) {
--    int y0 = y_buf[x >> 16];
--    int y1 = y_buf[(x >> 16) + 1];
--    int u0 = u_buf[(x >> 17)];
--    int u1 = u_buf[(x >> 17) + 1];
--    int v0 = v_buf[(x >> 17)];
--    int v1 = v_buf[(x >> 17) + 1];
--    int y_frac = (x & 65535);
--    int uv_frac = ((x >> 1) & 65535);
--    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
--    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
--    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
--    YuvPixel(y, u, v, rgb_buf);
--    x += source_dx;
--    if ((i + 1) < width) {
--      y0 = y_buf[x >> 16];
--      y1 = y_buf[(x >> 16) + 1];
--      y_frac = (x & 65535);
--      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
--      YuvPixel(y, u, v, rgb_buf+4);
--      x += source_dx;
--    }
--    rgb_buf += 8;
--  }
--}
--
--#endif  // USE_MMX
--}  // extern "C"
--
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
-+#endif
-+
-+}
-diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
---- a/gfx/ycbcr/yuv_row_table.cpp
-+++ b/gfx/ycbcr/yuv_row_table.cpp
-@@ -1,13 +1,13 @@
- // Copyright (c) 2010 The Chromium Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
--#include "media/base/yuv_row.h"
-+#include "yuv_row.h"
- 
- extern "C" {
- 
- #define RGBY(i) { \
-   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-   0 \
-diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
---- a/gfx/ycbcr/yuv_row_win.cpp
-+++ b/gfx/ycbcr/yuv_row_win.cpp
-@@ -1,26 +1,27 @@
- // Copyright (c) 2010 The Chromium Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
--#include "media/base/yuv_row.h"
-+#include "yuv_row.h"
-+#include "mozilla/SSE.h"
- 
- #define kCoefficientsRgbU kCoefficientsRgbY + 2048
- #define kCoefficientsRgbV kCoefficientsRgbY + 4096
- 
- extern "C" {
- 
--#if USE_MMX
--__declspec(naked)
--void FastConvertYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width) {
-+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
-+__declspec(naked)
-+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                  const uint8* u_buf,
-+                                  const uint8* v_buf,
-+                                  uint8* rgb_buf,
-+                                  int width) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]   // Y
-     mov       edi, [esp + 32 + 8]   // U
-     mov       esi, [esp + 32 + 12]  // V
-     mov       ebp, [esp + 32 + 16]  // rgb
-     mov       ecx, [esp + 32 + 20]  // width
-     jmp       convertend
-@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
-  convertdone :
- 
-     popad
-     ret
-   }
- }
- 
- __declspec(naked)
--void ConvertYUVToRGB32Row(const uint8* y_buf,
--                          const uint8* u_buf,
--                          const uint8* v_buf,
--                          uint8* rgb_buf,
--                          int width,
--                          int step) {
-+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int step) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]   // Y
-     mov       edi, [esp + 32 + 8]   // U
-     mov       esi, [esp + 32 + 12]  // V
-     mov       ebp, [esp + 32 + 16]  // rgb
-     mov       ecx, [esp + 32 + 20]  // width
-     mov       ebx, [esp + 32 + 24]  // step
-@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
-  wdone :
- 
-     popad
-     ret
-   }
- }
- 
- __declspec(naked)
--void RotateConvertYUVToRGB32Row(const uint8* y_buf,
--                                const uint8* u_buf,
--                                const uint8* v_buf,
--                                uint8* rgb_buf,
--                                int width,
--                                int ystep,
--                                int uvstep) {
-+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                    const uint8* u_buf,
-+                                    const uint8* v_buf,
-+                                    uint8* rgb_buf,
-+                                    int width,
-+                                    int ystep,
-+                                    int uvstep) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]   // Y
-     mov       edi, [esp + 32 + 8]   // U
-     mov       esi, [esp + 32 + 12]  // V
-     mov       ebp, [esp + 32 + 16]  // rgb
-     mov       ecx, [esp + 32 + 20]  // width
-     jmp       wend
-@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
-  wdone :
- 
-     popad
-     ret
-   }
- }
- 
- __declspec(naked)
--void DoubleYUVToRGB32Row(const uint8* y_buf,
--                         const uint8* u_buf,
--                         const uint8* v_buf,
--                         uint8* rgb_buf,
--                         int width) {
-+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                             const uint8* u_buf,
-+                             const uint8* v_buf,
-+                             uint8* rgb_buf,
-+                             int width) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]   // Y
-     mov       edi, [esp + 32 + 8]   // U
-     mov       esi, [esp + 32 + 12]  // V
-     mov       ebp, [esp + 32 + 16]  // rgb
-     mov       ecx, [esp + 32 + 20]  // width
-     jmp       wend
-@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
-     jns       wloop1
-  wdone :
-     popad
-     ret
-   }
- }
- 
- // This version does general purpose scaling by any amount, up or down.
--// The only thing it can not do it rotation by 90 or 270.
--// For performance the chroma is under sampled, reducing cost of a 3x
-+// The only thing it cannot do is rotation by 90 or 270.
-+// For performance the chroma is under-sampled, reducing cost of a 3x
- // 1080p scale from 8.4 ms to 5.4 ms.
- __declspec(naked)
--void ScaleYUVToRGB32Row(const uint8* y_buf,
--                        const uint8* u_buf,
--                        const uint8* v_buf,
--                        uint8* rgb_buf,
--                        int width,
--                        int source_dx) {
-+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                            const uint8* u_buf,
-+                            const uint8* v_buf,
-+                            uint8* rgb_buf,
-+                            int width,
-+                            int source_dx) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]   // Y
-     mov       edi, [esp + 32 + 8]   // U
-     mov       esi, [esp + 32 + 12]  // V
-     mov       ebp, [esp + 32 + 16]  // rgb
-     mov       ecx, [esp + 32 + 20]  // width
-     xor       ebx, ebx              // x
-@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
- 
-  scaledone :
-     popad
-     ret
-   }
- }
- 
- __declspec(naked)
--void LinearScaleYUVToRGB32Row(const uint8* y_buf,
--                              const uint8* u_buf,
--                              const uint8* v_buf,
--                              uint8* rgb_buf,
--                              int width,
--                              int source_dx) {
-+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
-+                                  const uint8* u_buf,
-+                                  const uint8* v_buf,
-+                                  uint8* rgb_buf,
-+                                  int width,
-+                                  int source_dx) {
-   __asm {
-     pushad
-     mov       edx, [esp + 32 + 4]  // Y
-     mov       edi, [esp + 32 + 8]  // U
-                 // [esp + 32 + 12] // V
-     mov       ebp, [esp + 32 + 16] // rgb
-     mov       ecx, [esp + 32 + 20] // width
-     imul      ecx, [esp + 32 + 24] // source_dx
-@@ -438,152 +439,60 @@ lscalelastpixel:
-     paddsw    mm1, mm0
-     psraw     mm1, 6
-     packuswb  mm1, mm1
-     movd      [ebp], mm1
-     popad
-     ret
-   };
- }
--#else  // USE_MMX
--
--// C reference code that mimic the YUV assembly.
--#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
--#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
--    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
--
--static inline void YuvPixel(uint8 y,
--                            uint8 u,
--                            uint8 v,
--                            uint8* rgb_buf) {
--
--  int b = kCoefficientsRgbY[256+u][0];
--  int g = kCoefficientsRgbY[256+u][1];
--  int r = kCoefficientsRgbY[256+u][2];
--  int a = kCoefficientsRgbY[256+u][3];
--
--  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
--  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
--  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
--  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
--
--  b = paddsw(b, kCoefficientsRgbY[y][0]);
--  g = paddsw(g, kCoefficientsRgbY[y][1]);
--  r = paddsw(r, kCoefficientsRgbY[y][2]);
--  a = paddsw(a, kCoefficientsRgbY[y][3]);
--
--  b >>= 6;
--  g >>= 6;
--  r >>= 6;
--  a >>= 6;
--
--  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
--                                        (packuswb(g) << 8) |
--                                        (packuswb(r) << 16) |
--                                        (packuswb(a) << 24);
--}
--
--#if TEST_MMX_YUV
--static inline void YuvPixel(uint8 y,
--                            uint8 u,
--                            uint8 v,
--                            uint8* rgb_buf) {
--
--  __asm {
--    movzx     eax, u
--    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
--    movzx     eax, v
--    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
--    movzx     eax, y
--    movq      mm1, [kCoefficientsRgbY + 8 * eax]
--    paddsw    mm1, mm0
--    psraw     mm1, 6
--    packuswb  mm1, mm1
--    mov       eax, rgb_buf
--    movd      [eax], mm1
--    emms
--  }
--}
--#endif
-+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
- 
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width) {
--  for (int x = 0; x < width; x += 2) {
--    uint8 u = u_buf[x >> 1];
--    uint8 v = v_buf[x >> 1];
--    uint8 y0 = y_buf[x];
--    YuvPixel(y0, u, v, rgb_buf);
--    if ((x + 1) < width) {
--      uint8 y1 = y_buf[x + 1];
--      YuvPixel(y1, u, v, rgb_buf + 4);
--    }
--    rgb_buf += 8;  // Advance 2 pixels.
--  }
--}
--
--// 16.16 fixed point is used.  A shift by 16 isolates the integer.
--// A shift by 17 is used to further subsample the chrominence channels.
--// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
--// for 1/65536 pixel accurate interpolation.
-+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
-+  if (mozilla::supports_sse()) {
-+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
-+    return;
-+  }
-+#endif
-+
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
-+}
-+
- void ScaleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width,
-                         int source_dx) {
--  int x = 0;
--  for (int i = 0; i < width; i += 2) {
--    int y = y_buf[x >> 16];
--    int u = u_buf[(x >> 17)];
--    int v = v_buf[(x >> 17)];
--    YuvPixel(y, u, v, rgb_buf);
--    x += source_dx;
--    if ((i + 1) < width) {
--      y = y_buf[x >> 16];
--      YuvPixel(y, u, v, rgb_buf+4);
--      x += source_dx;
--    }
--    rgb_buf += 8;
--  }
--}
-+
-+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
-+  if (mozilla::supports_sse()) {
-+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+    return;
-+  }
-+#endif
-+
-+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
- 
- void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx) {
--  int x = 0;
--  if (source_dx >= 0x20000) {
--    x = 32768;
--  }
--  for (int i = 0; i < width; i += 2) {
--    int y0 = y_buf[x >> 16];
--    int y1 = y_buf[(x >> 16) + 1];
--    int u0 = u_buf[(x >> 17)];
--    int u1 = u_buf[(x >> 17) + 1];
--    int v0 = v_buf[(x >> 17)];
--    int v1 = v_buf[(x >> 17) + 1];
--    int y_frac = (x & 65535);
--    int uv_frac = ((x >> 1) & 65535);
--    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
--    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
--    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
--    YuvPixel(y, u, v, rgb_buf);
--    x += source_dx;
--    if ((i + 1) < width) {
--      y0 = y_buf[x >> 16];
--      y1 = y_buf[(x >> 16) + 1];
--      y_frac = (x & 65535);
--      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
--      YuvPixel(y, u, v, rgb_buf+4);
--      x += source_dx;
--    }
--    rgb_buf += 8;
--  }
--}
--
--#endif  // USE_MMX
--}  // extern "C"
--
-+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
-+  if (mozilla::supports_sse()) {
-+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
-+                                 source_dx);
-+    return;
-+  }
-+#endif
-+
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
-+
-+} // extern "C"
deleted file mode 100644
--- a/gfx/ycbcr/update.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-# update.sh <chromium-src-directory>
-cp $1/media/base/yuv_convert.h .
-cp $1/media/base/yuv_convert.cc yuv_convert.cpp
-cp $1/media/base/yuv_row.h .
-cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp
-cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp
-cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp
-cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp
-patch -p3 <convert.patch
-patch -p3 <win64.patch
-patch -p3 <TypeFromSize.patch
-patch -p3 <QuellGccWarnings.patch
-patch -p3 <clang-cl-workaround.patch
deleted file mode 100644
--- a/gfx/ycbcr/win64.patch
+++ /dev/null
@@ -1,210 +0,0 @@
-diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
-new file mode 100644
---- /dev/null
-+++ b/gfx/ycbcr/yuv_row_win64.cpp
-@@ -0,0 +1,205 @@
-+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-+// Use of this source code is governed by a BSD-style license that can be
-+// found in the LICENSE file.
-+
-+#include "yuv_row.h"
-+
-+extern "C" {
-+
-+// x64 compiler doesn't support MMX and inline assembler.  Use SSE2 intrinsics.
-+
-+#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
-+#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
-+
-+#include <emmintrin.h>
-+
-+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
-+                                          const uint8* u_buf,
-+                                          const uint8* v_buf,
-+                                          uint8* rgb_buf,
-+                                          int width) {
-+  __m128i xmm0, xmmY1, xmmY2;
-+  __m128  xmmY;
-+
-+  while (width >= 2) {
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
-+
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+
-+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
-+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
-+
-+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
-+                          0x44);
-+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+
-+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
-+    rgb_buf += 8;
-+    width -= 2;
-+  }
-+
-+  if (width) {
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
-+  }
-+}
-+
-+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
-+                                    const uint8* u_buf,
-+                                    const uint8* v_buf,
-+                                    uint8* rgb_buf,
-+                                    int width,
-+                                    int source_dx) {
-+  __m128i xmm0, xmmY1, xmmY2;
-+  __m128  xmmY;
-+  uint8 u, v, y;
-+  int x = 0;
-+
-+  while (width >= 2) {
-+    u = u_buf[x >> 17];
-+    v = v_buf[x >> 17];
-+    y = y_buf[x >> 16];
-+    x += source_dx;
-+
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+
-+    y = y_buf[x >> 16];
-+    x += source_dx;
-+
-+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
-+
-+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
-+                          0x44);
-+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+
-+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
-+    rgb_buf += 8;
-+    width -= 2;
-+  }
-+
-+  if (width) {
-+    u = u_buf[x >> 17];
-+    v = v_buf[x >> 17];
-+    y = y_buf[x >> 16];
-+
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
-+  }
-+}
-+
-+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
-+                                          const uint8* u_buf,
-+                                          const uint8* v_buf,
-+                                          uint8* rgb_buf,
-+                                          int width,
-+                                          int source_dx) {
-+  __m128i xmm0, xmmY1, xmmY2;
-+  __m128  xmmY;
-+  uint8 u0, u1, v0, v1, y0, y1;
-+  uint32 uv_frac, y_frac, u, v, y;
-+  int x = 0;
-+
-+  if (source_dx >= 0x20000) {
-+    x = 32768;
-+  }
-+
-+  while(width >= 2) {
-+    u0 = u_buf[x >> 17];
-+    u1 = u_buf[(x >> 17) + 1];
-+    v0 = v_buf[x >> 17];
-+    v1 = v_buf[(x >> 17) + 1];
-+    y0 = y_buf[x >> 16];
-+    y1 = y_buf[(x >> 16) + 1];
-+    uv_frac = (x & 0x1fffe);
-+    y_frac = (x & 0xffff);
-+    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
-+    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
-+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
-+    x += source_dx;
-+
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+
-+    y0 = y_buf[x >> 16];
-+    y1 = y_buf[(x >> 16) + 1];
-+    y_frac = (x & 0xffff);
-+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
-+    x += source_dx;
-+
-+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
-+
-+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
-+                          0x44);
-+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+
-+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
-+    rgb_buf += 8;
-+    width -= 2;
-+  }
-+
-+  if (width) {
-+    u = u_buf[x >> 17];
-+    v = v_buf[x >> 17];
-+    y = y_buf[x >> 16];
-+
-+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
-+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
-+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
-+
-+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
-+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
-+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
-+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
-+  }
-+}
-+
-+void FastConvertYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width) {
-+  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
-+}
-+
-+void ScaleYUVToRGB32Row(const uint8* y_buf,
-+                        const uint8* u_buf,
-+                        const uint8* v_buf,
-+                        uint8* rgb_buf,
-+                        int width,
-+                        int source_dx) {
-+  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
-+}
-+
-+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int source_dx) {
-+  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
-+                                source_dx);
-+}
-+
-+} // extern "C"