gfx/ycbcr/convert.patch
author Andrzej Hunt <andrzej@ahunt.org>
Tue, 15 Mar 2016 14:44:13 -0700
changeset 323712 876a1f819d83ef8035e579dd9247693c9526875b
parent 294042 69b07795c5fa8718d7119eb7c0319318edf66d7f
child 465224 8a463b11e2411cf11ce3113c00a3ef63f844f2d8
permissions -rw-r--r--
Bug 1254797 - Post: Ensure we ignore deleted sites in pinned query r=rnewman, a=ritu Pinned sites should be deleted directly, however I'm not confident enough in my knowledge of sync to be certain that we won't end up with deleted pinned sites in our table. (We use normal bookmark deletion for removing pinned sites.) MozReview-Commit-ID: SSLDkSXWlI

diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -6,145 +6,102 @@
 // http://www.fourcc.org/yuv.php
 // The actual conversion is best described here
 // http://en.wikipedia.org/wiki/YUV
 // An article on optimizing YUV conversion using tables instead of multiplies
 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
 //
 // YV12 is a full plane of Y and a half height, half width chroma planes
 // YV16 is a full plane of Y and a full height, half width chroma planes
+// YV24 is a full plane of Y and a full height, full width chroma planes
 //
 // ARGB pixel format is output, which on little endian is stored as BGRA.
 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
 
-#include "media/base/yuv_convert.h"
+#include "yuv_convert.h"
 
 // Header for low level row functions.
-#include "media/base/yuv_row.h"
-
-#if USE_MMX
-#if defined(_MSC_VER)
-#include <intrin.h>
-#else
-#include <mmintrin.h>
-#endif
-#endif
-
-#if USE_SSE2
-#include <emmintrin.h>
-#endif
-
-namespace media {
-
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+namespace mozilla {
+
+namespace gfx {
+ 
 // 16.16 fixed point arithmetic
 const int kFractionBits = 16;
 const int kFractionMax = 1 << kFractionBits;
 const int kFractionMask = ((1 << kFractionBits) - 1);
 
 // Convert a frame of YUV to 32 bit ARGB.
-void ConvertYUVToRGB32(const uint8* y_buf,
-                       const uint8* u_buf,
-                       const uint8* v_buf,
-                       uint8* rgb_buf,
-                       int width,
-                       int height,
-                       int y_pitch,
-                       int uv_pitch,
-                       int rgb_pitch,
-                       YUVType yuv_type) {
-  unsigned int y_shift = yuv_type;
-  for (int y = 0; y < height; ++y) {
-    uint8* rgb_row = rgb_buf + y * rgb_pitch;
-    const uint8* y_ptr = y_buf + y * y_pitch;
-    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
-    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
-
-    FastConvertYUVToRGB32Row(y_ptr,
-                             u_ptr,
-                             v_ptr,
-                             rgb_row,
-                             width);
-  }
+void ConvertYCbCrToRGB32(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int pic_x,
+                         int pic_y,
+                         int pic_width,
+                         int pic_height,
+                         int y_pitch,
+                         int uv_pitch,
+                         int rgb_pitch,
+                         YUVType yuv_type) {
+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
+  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
+  bool has_sse = supports_mmx() && supports_sse();
+  // There is no optimized YV24 SSE routine so we check for this and
+  // fall back to the C code.
+  has_sse &= yuv_type != YV24;
+  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
+  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
+
+  for (int y = pic_y; y < pic_height + pic_y; ++y) {
+    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
+    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
+    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+
+    if (odd_pic_x) {
+      // Handle the single odd pixel manually and use the
+      // fast routines for the remaining pixels.
+      FastConvertYUVToRGB32Row_C(y_ptr++,
+                                 u_ptr++,
+                                 v_ptr++,
+                                 rgb_row,
+                                 1,
+                                 x_shift);
+      rgb_row += 4;
+    }
+
+    if (has_sse) {
+      FastConvertYUVToRGB32Row(y_ptr,
+                               u_ptr,
+                               v_ptr,
+                               rgb_row,
+                               x_width);
+    }
+    else {
+      FastConvertYUVToRGB32Row_C(y_ptr,
+                                 u_ptr,
+                                 v_ptr,
+                                 rgb_row,
+                                 x_width,
+                                 x_shift);
+    }
+  }
 
   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
-  EMMS();
-}
-
-#if USE_SSE2
-// FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version does 16 pixels at a time
-
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
-  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
-
-  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
-  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
-  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
-  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
-
-  do {
-    __m128i y0 = _mm_loadu_si128(y0_ptr128);
-    __m128i y1 = _mm_loadu_si128(y1_ptr128);
-    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
-    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
-    y0 = _mm_unpacklo_epi8(y0, zero);
-    y1 = _mm_unpacklo_epi8(y1, zero);
-    y0 = _mm_mullo_epi16(y0, y0_fraction);
-    y1 = _mm_mullo_epi16(y1, y1_fraction);
-    y2 = _mm_mullo_epi16(y2, y0_fraction);
-    y3 = _mm_mullo_epi16(y3, y1_fraction);
-    y0 = _mm_add_epi16(y0, y1);
-    y2 = _mm_add_epi16(y2, y3);
-    y0 = _mm_srli_epi16(y0, 8);
-    y2 = _mm_srli_epi16(y2, 8);
-    y0 = _mm_packus_epi16(y0, y2);
-    *dest128++ = y0;
-    ++y0_ptr128;
-    ++y1_ptr128;
-  } while (dest128 < end128);
-}
-#elif USE_MMX
-// MMX version does 8 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
-  __m64 zero = _mm_setzero_si64();
-  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
-  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
-
-  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
-  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
-  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
-  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
-
-  do {
-    __m64 y0 = *y0_ptr64++;
-    __m64 y1 = *y1_ptr64++;
-    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
-    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
-    y0 = _mm_unpacklo_pi8(y0, zero);
-    y1 = _mm_unpacklo_pi8(y1, zero);
-    y0 = _mm_mullo_pi16(y0, y0_fraction);
-    y1 = _mm_mullo_pi16(y1, y1_fraction);
-    y2 = _mm_mullo_pi16(y2, y0_fraction);
-    y3 = _mm_mullo_pi16(y3, y1_fraction);
-    y0 = _mm_add_pi16(y0, y1);
-    y2 = _mm_add_pi16(y2, y3);
-    y0 = _mm_srli_pi16(y0, 8);
-    y2 = _mm_srli_pi16(y2, 8);
-    y0 = _mm_packs_pu16(y0, y2);
-    *dest64++ = y0;
-  } while (dest64 < end64);
-}
-#else  // no MMX or SSE2
+  if (has_sse)
+    EMMS();
+}
+
 // C version does 8 at a time to mimic MMX code
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                         int source_width, int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   uint8* end = ybuf + source_width;
   do {
     ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
     ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
     ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
     ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
     ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
     ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
     ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
     y0_ptr += 8;
     y1_ptr += 8;
     ybuf += 8;
   } while (ybuf < end);
 }
-#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction);
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction);
+#endif
+
+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
+                              const uint8* y1_ptr, int source_width,
+                              int source_y_fraction) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+  if (mozilla::supports_sse2()) {
+    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+    return;
+  }
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+  if (mozilla::supports_mmx()) {
+    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+    return;
+  }
+#endif
+
+  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
 
 
 // Scale a frame of YUV to 32 bit ARGB.
-void ScaleYUVToRGB32(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int source_width,
-                     int source_height,
-                     int width,
-                     int height,
-                     int y_pitch,
-                     int uv_pitch,
-                     int rgb_pitch,
-                     YUVType yuv_type,
-                     Rotate view_rotate,
-                     ScaleFilter filter) {
+void ScaleYCbCrToRGB32(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int source_width,
+                       int source_height,
+                       int width,
+                       int height,
+                       int y_pitch,
+                       int uv_pitch,
+                       int rgb_pitch,
+                       YUVType yuv_type,
+                       Rotate view_rotate,
+                       ScaleFilter filter) {
+  bool has_mmx = supports_mmx();
+
   // 4096 allows 3 buffers to fit in 12k.
   // Helps performance on CPU with 16K L1 cache.
   // Large enough for 3830x2160 and 30" displays which are 2560x1600.
   const int kFilterBufferSize = 4096;
   // Disable filtering if the screen is too big (to avoid buffer overflows).
   // This should never happen to regular users: they don't have monitors
   // wider than 4096 pixels.
   // TODO(fbarchard): Allow rotated videos to filter.
   if (source_width > kFilterBufferSize || view_rotate)
     filter = FILTER_NONE;
 
-  unsigned int y_shift = yuv_type;
+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
   // Diagram showing origin and direction of source sampling.
   // ->0   4<-
   // 7       3
   //
   // 6       5
   // ->1   2<-
   // Rotations that start at right side of image.
   if ((view_rotate == ROTATE_180) ||
@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
     int source_uv_fraction =
         ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
 
     const uint8* y_ptr = y0_ptr;
     const uint8* u_ptr = u0_ptr;
     const uint8* v_ptr = v0_ptr;
     // Apply vertical filtering if necessary.
     // TODO(fbarchard): Remove memcpy when not necessary.
-    if (filter & media::FILTER_BILINEAR_V) {
+    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
       if (yscale_fixed != kFractionMax &&
           source_y_fraction && ((source_y + 1) < source_height)) {
         FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
       } else {
         memcpy(ybuf, y0_ptr, source_width);
       }
       y_ptr = ybuf;
       ybuf[source_width] = ybuf[source_width-1];
@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
       u_ptr = ubuf;
       v_ptr = vbuf;
       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
     }
     if (source_dx == kFractionMax) {  // Not scaled
       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                dest_pixel, width);
-    } else {
-      if (filter & FILTER_BILINEAR_H) {
+    } else if (filter & FILTER_BILINEAR_H) {
         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                  dest_pixel, width, source_dx);
     } else {
 // Specialized scalers and rotation.
-#if USE_MMX && defined(_MSC_VER)
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
+      if(mozilla::supports_sse()) {
         if (width == (source_width * 2)) {
-          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                              dest_pixel, width);
+          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                  dest_pixel, width);
         } else if ((source_dx & kFractionMask) == 0) {
           // Scaling by integer scale factor. ie half.
-          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                               dest_pixel, width,
-                               source_dx >> kFractionBits);
+          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, width,
+                                   source_dx >> kFractionBits);
         } else if (source_dx_uv == source_dx) {  // Not rotated.
           ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                              dest_pixel, width, source_dx);
         } else {
-          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                                     dest_pixel, width,
-                                     source_dx >> kFractionBits,
-                                     source_dx_uv >> kFractionBits);
+          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                         dest_pixel, width,
+                                         source_dx >> kFractionBits,
+                                         source_dx_uv >> kFractionBits);
         }
+      }
+      else {
+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, width, source_dx);
+      }
 #else
-        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                           dest_pixel, width, source_dx);
-#endif
-      }
+      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                         dest_pixel, width, source_dx);
+#endif
     }
   }
   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
-  EMMS();
-}
-
-}  // namespace media
+  if (has_mmx)
+    EMMS();
+}
+
+}  // namespace gfx
+}  // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
--- a/gfx/ycbcr/yuv_convert.h
+++ b/gfx/ycbcr/yuv_convert.h
@@ -1,72 +1,79 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #ifndef MEDIA_BASE_YUV_CONVERT_H_
 #define MEDIA_BASE_YUV_CONVERT_H_
 
-#include "base/basictypes.h"
-
-namespace media {
-
+#include "chromium_types.h"
+#include "gfxCore.h"
+
+namespace mozilla {
+
+namespace gfx {
+ 
 // Type of YUV surface.
 // The value of these enums matter as they are used to shift vertical indices.
 enum YUVType {
-  YV16 = 0,           // YV16 is half width and full height chroma channels.
-  YV12 = 1,           // YV12 is half width and half height chroma channels.
+  YV12 = 0,           // YV12 is half width and half height chroma channels.
+  YV16 = 1,           // YV16 is half width and full height chroma channels.
+  YV24 = 2            // YV24 is full width and full height chroma channels.
 };
 
 // Mirror means flip the image horizontally, as in looking in a mirror.
 // Rotate happens after mirroring.
 enum Rotate {
   ROTATE_0,           // Rotation off.
   ROTATE_90,          // Rotate clockwise.
   ROTATE_180,         // Rotate upside down.
   ROTATE_270,         // Rotate counter clockwise.
   MIRROR_ROTATE_0,    // Mirror horizontally.
   MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
   MIRROR_ROTATE_180,  // Mirror vertically.
-  MIRROR_ROTATE_270,  // Transpose.
+  MIRROR_ROTATE_270   // Transpose.
 };
 
 // Filter affects how scaling looks.
 enum ScaleFilter {
   FILTER_NONE = 0,        // No filter (point sampled).
   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
-  FILTER_BILINEAR = 3,    // Bilinear filter.
+  FILTER_BILINEAR = 3     // Bilinear filter.
 };
 
 // Convert a frame of YUV to 32 bit ARGB.
 // Pass in YV16/YV12 depending on source format
-void ConvertYUVToRGB32(const uint8* yplane,
-                       const uint8* uplane,
-                       const uint8* vplane,
-                       uint8* rgbframe,
-                       int width,
-                       int height,
-                       int ystride,
-                       int uvstride,
-                       int rgbstride,
-                       YUVType yuv_type);
+void ConvertYCbCrToRGB32(const uint8* yplane,
+                         const uint8* uplane,
+                         const uint8* vplane,
+                         uint8* rgbframe,
+                         int pic_x,
+                         int pic_y,
+                         int pic_width,
+                         int pic_height,
+                         int ystride,
+                         int uvstride,
+                         int rgbstride,
+                         YUVType yuv_type);
 
 // Scale a frame of YUV to 32 bit ARGB.
 // Supports rotation and mirroring.
-void ScaleYUVToRGB32(const uint8* yplane,
-                     const uint8* uplane,
-                     const uint8* vplane,
-                     uint8* rgbframe,
-                     int source_width,
-                     int source_height,
-                     int width,
-                     int height,
-                     int ystride,
-                     int uvstride,
-                     int rgbstride,
-                     YUVType yuv_type,
-                     Rotate view_rotate,
-                     ScaleFilter filter);
-
-}  // namespace media
-
+void ScaleYCbCrToRGB32(const uint8* yplane,
+                       const uint8* uplane,
+                       const uint8* vplane,
+                       uint8* rgbframe,
+                       int source_width,
+                       int source_height,
+                       int width,
+                       int height,
+                       int ystride,
+                       int uvstride,
+                       int rgbstride,
+                       YUVType yuv_type,
+                       Rotate view_rotate,
+                       ScaleFilter filter);
+
+}  // namespace gfx
+}  // namespace mozilla
+ 
 #endif  // MEDIA_BASE_YUV_CONVERT_H_
diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// MMX version does 8 pixels at a time.
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction) {
+  __m64 zero = _mm_setzero_si64();
+  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+
+  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+  do {
+    __m64 y0 = *y0_ptr64++;
+    __m64 y1 = *y1_ptr64++;
+    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+    y0 = _mm_unpacklo_pi8(y0, zero);
+    y1 = _mm_unpacklo_pi8(y1, zero);
+    y0 = _mm_mullo_pi16(y0, y0_fraction);
+    y1 = _mm_mullo_pi16(y1, y1_fraction);
+    y2 = _mm_mullo_pi16(y2, y0_fraction);
+    y3 = _mm_mullo_pi16(y3, y1_fraction);
+    y0 = _mm_add_pi16(y0, y1);
+    y2 = _mm_add_pi16(y2, y3);
+    y0 = _mm_srli_pi16(y0, 8);
+    y2 = _mm_srli_pi16(y2, 8);
+    y0 = _mm_packs_pu16(y0, y2);
+    *dest64++ = y0;
+  } while (dest64 < end64);
+}
+
+}
+}
diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// SSE2 version does 16 pixels at a time.
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+
+  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+  do {
+    __m128i y0 = _mm_loadu_si128(y0_ptr128);
+    __m128i y1 = _mm_loadu_si128(y1_ptr128);
+    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+    y0 = _mm_unpacklo_epi8(y0, zero);
+    y1 = _mm_unpacklo_epi8(y1, zero);
+    y0 = _mm_mullo_epi16(y0, y0_fraction);
+    y1 = _mm_mullo_epi16(y1, y1_fraction);
+    y2 = _mm_mullo_epi16(y2, y0_fraction);
+    y3 = _mm_mullo_epi16(y3, y1_fraction);
+    y0 = _mm_add_epi16(y0, y1);
+    y2 = _mm_add_epi16(y2, y3);
+    y0 = _mm_srli_epi16(y0, 8);
+    y2 = _mm_srli_epi16(y2, 8);
+    y0 = _mm_packus_epi16(y0, y2);
+    *dest128++ = y0;
+    ++y0_ptr128;
+    ++y1_ptr128;
+  } while (dest128 < end128);
+}
+
+}
+}
diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
--- a/gfx/ycbcr/yuv_row.h
+++ b/gfx/ycbcr/yuv_row.h
@@ -5,109 +5,133 @@
 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
 
 // TODO(fbarchard): Write function that can handle rotation and scaling.
 
 #ifndef MEDIA_BASE_YUV_ROW_H_
 #define MEDIA_BASE_YUV_ROW_H_
 
-#include "base/basictypes.h"
+#include "chromium_types.h"
 
 extern "C" {
 // Can only do 1x.
 // This is the second fastest of the scalers.
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width);
 
-// Can do 1x, half size or any scale down by an integer amount.
-// Step can be negative (mirroring, rotate 180).
-// This is the third fastest of the scalers.
-void ConvertYUVToRGB32Row(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width,
-                          int step);
-
-// Rotate is like Convert, but applies different step to Y versus U and V.
-// This allows rotation by 90 or 270, by stepping by stride.
-// This is the forth fastest of the scalers.
-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width,
-                                int ystep,
-                                int uvstep);
+                                unsigned int x_shift);
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+// Only defined on Windows x86-32.
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step);
+
+// Rotate is like Convert, but applies different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+// Only defined on Windows x86-32.
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep);
 
 // Doubler does 4 pixels at a time.  Each pixel is replicated.
 // This is the fastest of the scalers.
-void DoubleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width);
+// Only defined on Windows x86-32.
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
 
 // Handles arbitrary scaling up or down.
 // Mirroring is supported, but not 90 or 270 degree rotation.
 // Chroma is under sampled every 2 pixels for performance.
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
                         int source_dx);
 
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int source_dx);
+
 // Handles arbitrary scaling up or down with bilinear filtering.
 // Mirroring is supported, but not 90 or 270 degree rotation.
 // Chroma is under sampled every 2 pixels for performance.
 // This is the slowest of the scalers.
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx);
 
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int source_dx);
+
+
 #if defined(_MSC_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
 #else
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 #endif
 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
 
-// Method to force C version.
-//#define USE_MMX 0
-//#define USE_SSE2 0
-
-#if !defined(USE_MMX)
-// Windows, Mac and Linux/BSD use MMX
-#if defined(__MMX__) || defined(_MSC_VER)
-#define USE_MMX 1
-#else
-#define USE_MMX 0
-#endif
-#endif
-
-#if !defined(USE_SSE2)
-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
-#define USE_SSE2 1
-#else
-#define USE_SSE2 0
-#endif
-#endif
-
 // x64 uses MMX2 (SSE) so emms is not required.
 // Warning C4799: function has no EMMS instruction.
 // EMMS() is slow and should be called by the calling function once per image.
-#if USE_MMX && !defined(ARCH_CPU_X86_64)
+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
 #if defined(_MSC_VER)
 #define EMMS() __asm emms
 #pragma warning(disable: 4799)
 #else
 #define EMMS() asm("emms")
 #endif
 #else
 #define EMMS()
diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
--- a/gfx/ycbcr/yuv_row_c.cpp
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -1,812 +1,18 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "media/base/yuv_row.h"
-
-#ifdef _DEBUG
-#include "base/logging.h"
-#else
+#include "yuv_row.h"
+
 #define DCHECK(a)
-#endif
 
 extern "C" {
 
-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
-
-// AMD64 ABI uses register paremters.
-void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
-                              const uint8* u_buf,  // rsi
-                              const uint8* v_buf,  // rdx
-                              uint8* rgb_buf,      // rcx
-                              int width) {         // r8
-  asm(
-  "jmp    convertend\n"
-"convertloop:"
-  "movzb  (%1),%%r10\n"
-  "add    $0x1,%1\n"
-  "movzb  (%2),%%r11\n"
-  "add    $0x1,%2\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   4096(%5,%%r11,8),%%xmm1\n"
-  "movzb  0x1(%0),%%r11\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%r10,8),%%xmm2\n"
-  "add    $0x2,%0\n"
-  "movq   (%5,%%r11,8),%%xmm3\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "paddsw %%xmm0,%%xmm3\n"
-  "shufps $0x44,%%xmm3,%%xmm2\n"
-  "psraw  $0x6,%%xmm2\n"
-  "packuswb %%xmm2,%%xmm2\n"
-  "movq   %%xmm2,0x0(%3)\n"
-  "add    $0x8,%3\n"
-"convertend:"
-  "sub    $0x2,%4\n"
-  "jns    convertloop\n"
-
-"convertnext:"
-  "add    $0x1,%4\n"
-  "js     convertdone\n"
-
-  "movzb  (%1),%%r10\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%2),%%r10\n"
-  "movq   4096(%5,%%r10,8),%%xmm1\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   (%5,%%r10,8),%%xmm1\n"
-  "paddsw %%xmm0,%%xmm1\n"
-  "psraw  $0x6,%%xmm1\n"
-  "packuswb %%xmm1,%%xmm1\n"
-  "movd   %%xmm1,0x0(%3)\n"
-"convertdone:"
-  :
-  : "r"(y_buf),  // %0
-    "r"(u_buf),  // %1
-    "r"(v_buf),  // %2
-    "r"(rgb_buf),  // %3
-    "r"(width),  // %4
-    "r" (kCoefficientsRgbY)  // %5
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
-                        const uint8* u_buf,  // rsi
-                        const uint8* v_buf,  // rdx
-                        uint8* rgb_buf,      // rcx
-                        int width,           // r8
-                        int source_dx) {     // r9
-  asm(
-  "xor    %%r11,%%r11\n"
-  "sub    $0x2,%4\n"
-  "js     scalenext\n"
-
-"scaleloop:"
-  "mov    %%r11,%%r10\n"
-  "sar    $0x11,%%r10\n"
-  "movzb  (%1,%%r10,1),%%rax\n"
-  "movq   2048(%5,%%rax,8),%%xmm0\n"
-  "movzb  (%2,%%r10,1),%%rax\n"
-  "movq   4096(%5,%%rax,8),%%xmm1\n"
-  "lea    (%%r11,%6),%%r10\n"
-  "sar    $0x10,%%r11\n"
-  "movzb  (%0,%%r11,1),%%rax\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%rax,8),%%xmm1\n"
-  "lea    (%%r10,%6),%%r11\n"
-  "sar    $0x10,%%r10\n"
-  "movzb  (%0,%%r10,1),%%rax\n"
-  "movq   (%5,%%rax,8),%%xmm2\n"
-  "paddsw %%xmm0,%%xmm1\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "shufps $0x44,%%xmm2,%%xmm1\n"
-  "psraw  $0x6,%%xmm1\n"
-  "packuswb %%xmm1,%%xmm1\n"
-  "movq   %%xmm1,0x0(%3)\n"
-  "add    $0x8,%3\n"
-  "sub    $0x2,%4\n"
-  "jns    scaleloop\n"
-
-"scalenext:"
-  "add    $0x1,%4\n"
-  "js     scaledone\n"
-
-  "mov    %%r11,%%r10\n"
-  "sar    $0x11,%%r10\n"
-  "movzb  (%1,%%r10,1),%%rax\n"
-  "movq   2048(%5,%%rax,8),%%xmm0\n"
-  "movzb  (%2,%%r10,1),%%rax\n"
-  "movq   4096(%5,%%rax,8),%%xmm1\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "sar    $0x10,%%r11\n"
-  "movzb  (%0,%%r11,1),%%rax\n"
-  "movq   (%5,%%rax,8),%%xmm1\n"
-  "paddsw %%xmm0,%%xmm1\n"
-  "psraw  $0x6,%%xmm1\n"
-  "packuswb %%xmm1,%%xmm1\n"
-  "movd   %%xmm1,0x0(%3)\n"
-
-"scaledone:"
-  :
-  : "r"(y_buf),  // %0
-    "r"(u_buf),  // %1
-    "r"(v_buf),  // %2
-    "r"(rgb_buf),  // %3
-    "r"(width),  // %4
-    "r" (kCoefficientsRgbY),  // %5
-    "r"(static_cast<long>(source_dx))  // %6
-  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
-);
-}
-
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx) {
-  asm(
-  "xor    %%r11,%%r11\n"   // x = 0
-  "sub    $0x2,%4\n"
-  "js     .lscalenext\n"
-  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
-  "jl     .lscalehalf\n"
-  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
-".lscalehalf:"
-
-".lscaleloop:"
-  "mov    %%r11,%%r10\n"
-  "sar    $0x11,%%r10\n"
-
-  "movzb  (%1, %%r10, 1), %%r13 \n"
-  "movzb  1(%1, %%r10, 1), %%r14 \n"
-  "mov    %%r11, %%rax \n"
-  "and    $0x1fffe, %%rax \n"
-  "imul   %%rax, %%r14 \n"
-  "xor    $0x1fffe, %%rax \n"
-  "imul   %%rax, %%r13 \n"
-  "add    %%r14, %%r13 \n"
-  "shr    $17, %%r13 \n"
-  "movq   2048(%5,%%r13,8), %%xmm0\n"
-
-  "movzb  (%2, %%r10, 1), %%r13 \n"
-  "movzb  1(%2, %%r10, 1), %%r14 \n"
-  "mov    %%r11, %%rax \n"
-  "and    $0x1fffe, %%rax \n"
-  "imul   %%rax, %%r14 \n"
-  "xor    $0x1fffe, %%rax \n"
-  "imul   %%rax, %%r13 \n"
-  "add    %%r14, %%r13 \n"
-  "shr    $17, %%r13 \n"
-  "movq   4096(%5,%%r13,8), %%xmm1\n"
-
-  "mov    %%r11, %%rax \n"
-  "lea    (%%r11,%6),%%r10\n"
-  "sar    $0x10,%%r11\n"
-  "paddsw %%xmm1,%%xmm0\n"
-
-  "movzb  (%0, %%r11, 1), %%r13 \n"
-  "movzb  1(%0, %%r11, 1), %%r14 \n"
-  "and    $0xffff, %%rax \n"
-  "imul   %%rax, %%r14 \n"
-  "xor    $0xffff, %%rax \n"
-  "imul   %%rax, %%r13 \n"
-  "add    %%r14, %%r13 \n"
-  "shr    $16, %%r13 \n"
-  "movq   (%5,%%r13,8),%%xmm1\n"
-
-  "mov    %%r10, %%rax \n"
-  "lea    (%%r10,%6),%%r11\n"
-  "sar    $0x10,%%r10\n"
-
-  "movzb  (%0,%%r10,1), %%r13 \n"
-  "movzb  1(%0,%%r10,1), %%r14 \n"
-  "and    $0xffff, %%rax \n"
-  "imul   %%rax, %%r14 \n"
-  "xor    $0xffff, %%rax \n"
-  "imul   %%rax, %%r13 \n"
-  "add    %%r14, %%r13 \n"
-  "shr    $16, %%r13 \n"
-  "movq   (%5,%%r13,8),%%xmm2\n"
-
-  "paddsw %%xmm0,%%xmm1\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "shufps $0x44,%%xmm2,%%xmm1\n"
-  "psraw  $0x6,%%xmm1\n"
-  "packuswb %%xmm1,%%xmm1\n"
-  "movq   %%xmm1,0x0(%3)\n"
-  "add    $0x8,%3\n"
-  "sub    $0x2,%4\n"
-  "jns    .lscaleloop\n"
-
-".lscalenext:"
-  "add    $0x1,%4\n"
-  "js     .lscaledone\n"
-
-  "mov    %%r11,%%r10\n"
-  "sar    $0x11,%%r10\n"
-
-  "movzb  (%1,%%r10,1), %%r13 \n"
-  "movq   2048(%5,%%r13,8),%%xmm0\n"
-
-  "movzb  (%2,%%r10,1), %%r13 \n"
-  "movq   4096(%5,%%r13,8),%%xmm1\n"
-
-  "paddsw %%xmm1,%%xmm0\n"
-  "sar    $0x10,%%r11\n"
-
-  "movzb  (%0,%%r11,1), %%r13 \n"
-  "movq   (%5,%%r13,8),%%xmm1\n"
-
-  "paddsw %%xmm0,%%xmm1\n"
-  "psraw  $0x6,%%xmm1\n"
-  "packuswb %%xmm1,%%xmm1\n"
-  "movd   %%xmm1,0x0(%3)\n"
-
-".lscaledone:"
-  :
-  : "r"(y_buf),  // %0
-    "r"(u_buf),  // %1
-    "r"(v_buf),  // %2
-    "r"(rgb_buf),  // %3
-    "r"(width),  // %4
-    "r" (kCoefficientsRgbY),  // %5
-    "r"(static_cast<long>(source_dx))  // %6
-  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
-);
-}
-
-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
-
-// PIC version is slower because less registers are available, so
-// non-PIC is used on platforms where it is possible.
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
-  asm(
-  ".text\n"
-  ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
-  "jmp    convertend\n"
-
-"convertloop:"
-  "movzbl (%edi),%eax\n"
-  "add    $0x1,%edi\n"
-  "movzbl (%esi),%ebx\n"
-  "add    $0x1,%esi\n"
-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx\n"
-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "add    $0x2,%edx\n"
-  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-"convertend:"
-  "sub    $0x2,%ecx\n"
-  "jns    convertloop\n"
-
-  "and    $0x1,%ecx\n"
-  "je     convertdone\n"
-
-  "movzbl (%edi),%eax\n"
-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%esi),%eax\n"
-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "paddsw %mm0,%mm1\n"
-  "psraw  $0x6,%mm1\n"
-  "packuswb %mm1,%mm1\n"
-  "movd   %mm1,0x0(%ebp)\n"
-"convertdone:"
-  "popa\n"
-  "ret\n"
-);
-
-
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx);
-  asm(
-  ".text\n"
-  ".global ScaleYUVToRGB32Row\n"
-"ScaleYUVToRGB32Row:\n"
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
-  "xor    %ebx,%ebx\n"
-  "jmp    scaleend\n"
-
-"scaleloop:"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%edi,%eax,1),%eax\n"
-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%esi,%eax,1),%eax\n"
-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-"scaleend:"
-  "sub    $0x2,%ecx\n"
-  "jns    scaleloop\n"
-
-  "and    $0x1,%ecx\n"
-  "je     scaledone\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%edi,%eax,1),%eax\n"
-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%esi,%eax,1),%eax\n"
-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "paddsw %mm0,%mm1\n"
-  "psraw  $0x6,%mm1\n"
-  "packuswb %mm1,%mm1\n"
-  "movd   %mm1,0x0(%ebp)\n"
-
-"scaledone:"
-  "popa\n"
-  "ret\n"
-);
-
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx);
-  asm(
-  ".text\n"
-  ".global LinearScaleYUVToRGB32Row\n"
-"LinearScaleYUVToRGB32Row:\n"
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x30(%esp),%ebp\n"
-
-  // source_width = width * source_dx + ebx
-  "mov    0x34(%esp), %ecx\n"
-  "imull  0x38(%esp), %ecx\n"
-  "mov    %ecx, 0x34(%esp)\n"
-
-  "mov    0x38(%esp), %ecx\n"
-  "xor    %ebx,%ebx\n"     // x = 0
-  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
-  "jl     .lscaleend\n"
-  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
-  "jmp    .lscaleend\n"
-
-".lscaleloop:"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-
-  "movzbl (%edi,%eax,1),%ecx\n"
-  "movzbl 1(%edi,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "andl   $0x1fffe, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0x1fffe, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $17, %ecx \n"
-  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
-
-  "mov    0x2c(%esp),%esi\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-
-  "movzbl (%esi,%eax,1),%ecx\n"
-  "movzbl 1(%esi,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "andl   $0x1fffe, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0x1fffe, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $17, %ecx \n"
-  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%ecx\n"
-  "movzbl 1(%edx,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "andl   $0xffff, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0xffff, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $16, %ecx \n"
-  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
-
-  "cmp    0x34(%esp), %ebx\n"
-  "jge    .lscalelastpixel\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%ecx\n"
-  "movzbl 1(%edx,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "andl   $0xffff, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0xffff, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $16, %ecx \n"
-  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
-
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-
-".lscaleend:"
-  "cmp    0x34(%esp), %ebx\n"
-  "jl     .lscaleloop\n"
-  "popa\n"
-  "ret\n"
-
-".lscalelastpixel:"
-  "paddsw %mm0, %mm1\n"
-  "psraw $6, %mm1\n"
-  "packuswb %mm1, %mm1\n"
-  "movd %mm1, (%ebp)\n"
-  "popa\n"
-  "ret\n"
-);
-
-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
-
-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width,
-                                    int16 *kCoefficientsRgbY);
-  asm(
-  ".text\n"
-#if defined(OS_MACOSX)
-"_PICConvertYUVToRGB32Row:\n"
-#else
-"PICConvertYUVToRGB32Row:\n"
-#endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x38(%esp),%ecx\n"
-
-  "jmp    .Lconvertend\n"
-
-".Lconvertloop:"
-  "movzbl (%edi),%eax\n"
-  "add    $0x1,%edi\n"
-  "movzbl (%esi),%ebx\n"
-  "add    $0x1,%esi\n"
-  "movq   2048(%ecx,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx\n"
-  "movq   0(%ecx,%eax,8),%mm1\n"
-  "add    $0x2,%edx\n"
-  "movq   0(%ecx,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-".Lconvertend:"
-  "subl   $0x2,0x34(%esp)\n"
-  "jns    .Lconvertloop\n"
-
-  "andl   $0x1,0x34(%esp)\n"
-  "je     .Lconvertdone\n"
-
-  "movzbl (%edi),%eax\n"
-  "movq   2048(%ecx,%eax,8),%mm0\n"
-  "movzbl (%esi),%eax\n"
-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "movq   0(%ecx,%eax,8),%mm1\n"
-  "paddsw %mm0,%mm1\n"
-  "psraw  $0x6,%mm1\n"
-  "packuswb %mm1,%mm1\n"
-  "movd   %mm1,0x0(%ebp)\n"
-".Lconvertdone:\n"
-  "popa\n"
-  "ret\n"
-);
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
-                          &kCoefficientsRgbY[0][0]);
-}
-
-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx,
-                               int16 *kCoefficientsRgbY);
-
-  asm(
-  ".text\n"
-#if defined(OS_MACOSX)
-"_PICScaleYUVToRGB32Row:\n"
-#else
-"PICScaleYUVToRGB32Row:\n"
-#endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x3c(%esp),%ecx\n"
-  "xor    %ebx,%ebx\n"
-  "jmp    Lscaleend\n"
-
-"Lscaleloop:"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%edi,%eax,1),%eax\n"
-  "movq   2048(%ecx,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%esi,%eax,1),%eax\n"
-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   0(%ecx,%eax,8),%mm1\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   0(%ecx,%eax,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-"Lscaleend:"
-  "subl   $0x2,0x34(%esp)\n"
-  "jns    Lscaleloop\n"
-
-  "andl   $0x1,0x34(%esp)\n"
-  "je     Lscaledone\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%edi,%eax,1),%eax\n"
-  "movq   2048(%ecx,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-  "movzbl (%esi,%eax,1),%eax\n"
-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%eax\n"
-  "movq   0(%ecx,%eax,8),%mm1\n"
-  "paddsw %mm0,%mm1\n"
-  "psraw  $0x6,%mm1\n"
-  "packuswb %mm1,%mm1\n"
-  "movd   %mm1,0x0(%ebp)\n"
-
-"Lscaledone:"
-  "popa\n"
-  "ret\n"
-);
-
-
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                        &kCoefficientsRgbY[0][0]);
-}
-
-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width,
-                                 int source_dx,
-                                 int16 *kCoefficientsRgbY);
-  asm(
-  ".text\n"
-#if defined(OS_MACOSX)
-"_PICLinearScaleYUVToRGB32Row:\n"
-#else
-"PICLinearScaleYUVToRGB32Row:\n"
-#endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
-  "mov    0x3c(%esp),%edi\n"
-  "xor    %ebx,%ebx\n"
-
-  // source_width = width * source_dx + ebx
-  "mov    0x34(%esp), %ecx\n"
-  "imull  0x38(%esp), %ecx\n"
-  "mov    %ecx, 0x34(%esp)\n"
-
-  "mov    0x38(%esp), %ecx\n"
-  "xor    %ebx,%ebx\n"     // x = 0
-  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
-  "jl     .lscaleend\n"
-  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
-  "jmp    .lscaleend\n"
-
-".lscaleloop:"
-  "mov    0x28(%esp),%esi\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-
-  "movzbl (%esi,%eax,1),%ecx\n"
-  "movzbl 1(%esi,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "andl   $0x1fffe, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0x1fffe, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $17, %ecx \n"
-  "movq   2048(%edi,%ecx,8),%mm0\n"
-
-  "mov    0x2c(%esp),%esi\n"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
-
-  "movzbl (%esi,%eax,1),%ecx\n"
-  "movzbl 1(%esi,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "andl   $0x1fffe, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0x1fffe, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $17, %ecx \n"
-  "paddsw 4096(%edi,%ecx,8),%mm0\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%ecx\n"
-  "movzbl 1(%edx,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "andl   $0xffff, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0xffff, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $16, %ecx \n"
-  "movq   (%edi,%ecx,8),%mm1\n"
-
-  "cmp    0x34(%esp), %ebx\n"
-  "jge    .lscalelastpixel\n"
-
-  "mov    %ebx,%eax\n"
-  "sar    $0x10,%eax\n"
-  "movzbl (%edx,%eax,1),%ecx\n"
-  "movzbl 1(%edx,%eax,1),%esi\n"
-  "mov    %ebx,%eax\n"
-  "add    0x38(%esp),%ebx\n"
-  "andl   $0xffff, %eax \n"
-  "imul   %eax, %esi \n"
-  "xorl   $0xffff, %eax \n"
-  "imul   %eax, %ecx \n"
-  "addl   %esi, %ecx \n"
-  "shrl   $16, %ecx \n"
-  "movq   (%edi,%ecx,8),%mm2\n"
-
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "add    $0x8,%ebp\n"
-
-".lscaleend:"
-  "cmp    %ebx, 0x34(%esp)\n"
-  "jg     .lscaleloop\n"
-  "popa\n"
-  "ret\n"
-
-".lscalelastpixel:"
-  "paddsw %mm0, %mm1\n"
-  "psraw $6, %mm1\n"
-  "packuswb %mm1, %mm1\n"
-  "movd %mm1, (%ebp)\n"
-  "popa\n"
-  "ret\n"
-);
-
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                              &kCoefficientsRgbY[0][0]);
-}
-
-#else  // USE_MMX
-
 // C reference code that mimic the YUV assembly.
 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
     (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
 
 static inline void YuvPixel(uint8 y,
                             uint8 u,
                             uint8 v,
@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
   a >>= 6;
 
   *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
                                         (packuswb(g) << 8) |
                                         (packuswb(r) << 16) |
                                         (packuswb(a) << 24);
 }
 
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                unsigned int x_shift) {
   for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
+    uint8 u = u_buf[x >> x_shift];
+    uint8 v = v_buf[x >> x_shift];
     uint8 y0 = y_buf[x];
     YuvPixel(y0, u, v, rgb_buf);
     if ((x + 1) < width) {
       uint8 y1 = y_buf[x + 1];
+      if (x_shift == 0) {
+        u = u_buf[x + 1];
+        v = v_buf[x + 1];
+      }
       YuvPixel(y1, u, v, rgb_buf + 4);
     }
     rgb_buf += 8;  // Advance 2 pixels.
   }
 }
 
 // 16.16 fixed point is used.  A shift by 16 isolates the integer.
 // A shift by 17 is used to further subsample the chrominence channels.
 // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
 // for 1/65536 pixel accurate interpolation.
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int source_dx) {
   int x = 0;
   for (int i = 0; i < width; i += 2) {
     int y = y_buf[x >> 16];
     int u = u_buf[(x >> 17)];
     int v = v_buf[(x >> 17)];
     YuvPixel(y, u, v, rgb_buf);
     x += source_dx;
     if ((i + 1) < width) {
       y = y_buf[x >> 16];
       YuvPixel(y, u, v, rgb_buf+4);
       x += source_dx;
     }
     rgb_buf += 8;
   }
 }
 
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx) {
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int source_dx) {
   int x = 0;
   if (source_dx >= 0x20000) {
     x = 32768;
   }
   for (int i = 0; i < width; i += 2) {
     int y0 = y_buf[x >> 16];
     int y1 = y_buf[(x >> 16) + 1];
     int u0 = u_buf[(x >> 17)];
@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
       y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
       YuvPixel(y, u, v, rgb_buf+4);
       x += source_dx;
     }
     rgb_buf += 8;
   }
 }
 
-#endif  // USE_MMX
 }  // extern "C"
 
diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
--- a/gfx/ycbcr/yuv_row_posix.cpp
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -1,33 +1,32 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "media/base/yuv_row.h"
-
-#ifdef _DEBUG
-#include "base/logging.h"
-#else
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
 #define DCHECK(a)
-#endif
 
 extern "C" {
 
-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
+#if defined(ARCH_CPU_X86_64)
+
+// We don't need CPUID guards here, since x86-64 implies SSE2.
 
 // AMD64 ABI uses register paremters.
 void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                               const uint8* u_buf,  // rsi
                               const uint8* v_buf,  // rdx
                               uint8* rgb_buf,      // rcx
                               int width) {         // r8
   asm(
-  "jmp    convertend\n"
-"convertloop:"
+  "jmp    1f\n"
+"0:"
   "movzb  (%1),%%r10\n"
   "add    $0x1,%1\n"
   "movzb  (%2),%%r11\n"
   "add    $0x1,%2\n"
   "movq   2048(%5,%%r10,8),%%xmm0\n"
   "movzb  (%0),%%r10\n"
   "movq   4096(%5,%%r11,8),%%xmm1\n"
   "movzb  0x1(%0),%%r11\n"
@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
   "movq   (%5,%%r11,8),%%xmm3\n"
   "paddsw %%xmm0,%%xmm2\n"
   "paddsw %%xmm0,%%xmm3\n"
   "shufps $0x44,%%xmm3,%%xmm2\n"
   "psraw  $0x6,%%xmm2\n"
   "packuswb %%xmm2,%%xmm2\n"
   "movq   %%xmm2,0x0(%3)\n"
   "add    $0x8,%3\n"
-"convertend:"
+"1:"
   "sub    $0x2,%4\n"
-  "jns    convertloop\n"
-
-"convertnext:"
+  "jns    0b\n"
+
+"2:"
   "add    $0x1,%4\n"
-  "js     convertdone\n"
+  "js     3f\n"
 
   "movzb  (%1),%%r10\n"
   "movq   2048(%5,%%r10,8),%%xmm0\n"
   "movzb  (%2),%%r10\n"
   "movq   4096(%5,%%r10,8),%%xmm1\n"
   "paddsw %%xmm1,%%xmm0\n"
   "movzb  (%0),%%r10\n"
   "movq   (%5,%%r10,8),%%xmm1\n"
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%3)\n"
-"convertdone:"
+"3:"
   :
   : "r"(y_buf),  // %0
     "r"(u_buf),  // %1
     "r"(v_buf),  // %2
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY)  // %5
   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
                         const uint8* u_buf,  // rsi
                         const uint8* v_buf,  // rdx
                         uint8* rgb_buf,      // rcx
                         int width,           // r8
                         int source_dx) {     // r9
   asm(
   "xor    %%r11,%%r11\n"
   "sub    $0x2,%4\n"
-  "js     scalenext\n"
-
-"scaleloop:"
+  "js     1f\n"
+
+"0:"
   "mov    %%r11,%%r10\n"
   "sar    $0x11,%%r10\n"
   "movzb  (%1,%%r10,1),%%rax\n"
   "movq   2048(%5,%%rax,8),%%xmm0\n"
   "movzb  (%2,%%r10,1),%%rax\n"
   "movq   4096(%5,%%rax,8),%%xmm1\n"
   "lea    (%%r11,%6),%%r10\n"
   "sar    $0x10,%%r11\n"
@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
   "paddsw %%xmm0,%%xmm1\n"
   "paddsw %%xmm0,%%xmm2\n"
   "shufps $0x44,%%xmm2,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movq   %%xmm1,0x0(%3)\n"
   "add    $0x8,%3\n"
   "sub    $0x2,%4\n"
-  "jns    scaleloop\n"
-
-"scalenext:"
+  "jns    0b\n"
+
+"1:"
   "add    $0x1,%4\n"
-  "js     scaledone\n"
+  "js     2f\n"
 
   "mov    %%r11,%%r10\n"
   "sar    $0x11,%%r10\n"
   "movzb  (%1,%%r10,1),%%rax\n"
   "movq   2048(%5,%%rax,8),%%xmm0\n"
   "movzb  (%2,%%r10,1),%%rax\n"
   "movq   4096(%5,%%rax,8),%%xmm1\n"
   "paddsw %%xmm1,%%xmm0\n"
   "sar    $0x10,%%r11\n"
   "movzb  (%0,%%r11,1),%%rax\n"
   "movq   (%5,%%rax,8),%%xmm1\n"
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%3)\n"
 
-"scaledone:"
+"2:"
   :
   : "r"(y_buf),  // %0
     "r"(u_buf),  // %1
     "r"(v_buf),  // %2
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY),  // %5
     "r"(static_cast<long>(source_dx))  // %6
@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
   asm(
   "xor    %%r11,%%r11\n"   // x = 0
   "sub    $0x2,%4\n"
-  "js     .lscalenext\n"
+  "js     2f\n"
   "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
-  "jl     .lscalehalf\n"
+  "jl     0f\n"
   "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
-".lscalehalf:"
-
-".lscaleloop:"
+"0:"
+
+"1:"
   "mov    %%r11,%%r10\n"
   "sar    $0x11,%%r10\n"
 
   "movzb  (%1, %%r10, 1), %%r13 \n"
   "movzb  1(%1, %%r10, 1), %%r14 \n"
   "mov    %%r11, %%rax \n"
   "and    $0x1fffe, %%rax \n"
   "imul   %%rax, %%r14 \n"
@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
   "paddsw %%xmm0,%%xmm1\n"
   "paddsw %%xmm0,%%xmm2\n"
   "shufps $0x44,%%xmm2,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movq   %%xmm1,0x0(%3)\n"
   "add    $0x8,%3\n"
   "sub    $0x2,%4\n"
-  "jns    .lscaleloop\n"
-
-".lscalenext:"
+  "jns    1b\n"
+
+"2:"
   "add    $0x1,%4\n"
-  "js     .lscaledone\n"
+  "js     3f\n"
 
   "mov    %%r11,%%r10\n"
   "sar    $0x11,%%r10\n"
 
   "movzb  (%1,%%r10,1), %%r13 \n"
   "movq   2048(%5,%%r13,8),%%xmm0\n"
 
   "movzb  (%2,%%r10,1), %%r13 \n"
@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
   "movzb  (%0,%%r11,1), %%r13 \n"
   "movq   (%5,%%r13,8),%%xmm1\n"
 
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%3)\n"
 
-".lscaledone:"
+"3:"
   :
   : "r"(y_buf),  // %0
     "r"(u_buf),  // %1
     "r"(v_buf),  // %2
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY),  // %5
     "r"(static_cast<long>(source_dx))  // %6
   : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
 );
 }
 
-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
 
 // PIC version is slower because less registers are available, so
 // non-PIC is used on platforms where it is possible.
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
   asm(
   ".text\n"
-  ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
+  ".global FastConvertYUVToRGB32Row_SSE\n"
+  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
+"FastConvertYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
-  "jmp    convertend\n"
-
-"convertloop:"
+  "jmp    1f\n"
+
+"0:"
   "movzbl (%edi),%eax\n"
   "add    $0x1,%edi\n"
   "movzbl (%esi),%ebx\n"
   "add    $0x1,%esi\n"
   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   "movzbl (%edx),%eax\n"
   "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
   "movzbl 0x1(%edx),%ebx\n"
@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
   "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
-"convertend:"
+"1:"
   "sub    $0x2,%ecx\n"
-  "jns    convertloop\n"
+  "jns    0b\n"
 
   "and    $0x1,%ecx\n"
-  "je     convertdone\n"
+  "je     2f\n"
 
   "movzbl (%edi),%eax\n"
   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   "movzbl (%esi),%eax\n"
   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
   "movzbl (%edx),%eax\n"
   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   "paddsw %mm0,%mm1\n"
   "psraw  $0x6,%mm1\n"
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
-"convertdone:"
+"2:"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
-
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx);
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width)
+{
+  if (mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;
+  }
+
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx);
   asm(
   ".text\n"
-  ".global ScaleYUVToRGB32Row\n"
-"ScaleYUVToRGB32Row:\n"
+  ".global ScaleYUVToRGB32Row_SSE\n"
+  ".type ScaleYUVToRGB32Row_SSE, @function\n"
+"ScaleYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
   "xor    %ebx,%ebx\n"
-  "jmp    scaleend\n"
-
-"scaleloop:"
+  "jmp    1f\n"
+
+"0:"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%edi,%eax,1),%eax\n"
   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%esi,%eax,1),%eax\n"
   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
   "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
-"scaleend:"
+"1:"
   "sub    $0x2,%ecx\n"
-  "jns    scaleloop\n"
+  "jns    0b\n"
 
   "and    $0x1,%ecx\n"
-  "je     scaledone\n"
+  "je     2f\n"
 
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%edi,%eax,1),%eax\n"
   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%esi,%eax,1),%eax\n"
@@ -387,51 +404,73 @@ void ScaleYUVToRGB32Row(const uint8* y_b
   "sar    $0x10,%eax\n"
   "movzbl (%edx,%eax,1),%eax\n"
   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   "paddsw %mm0,%mm1\n"
   "psraw  $0x6,%mm1\n"
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
 
-"scaledone:"
+"2:"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx);
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx)
+{
+  if (mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                           width, source_dx);
+    // Done: without this return the C fallback below would run as well.
+    return;
+  }
+
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                       width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx);
   asm(
   ".text\n"
-  ".global LinearScaleYUVToRGB32Row\n"
-"LinearScaleYUVToRGB32Row:\n"
+  ".global LinearScaleYUVToRGB32Row_SSE\n"
+  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
+"LinearScaleYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x30(%esp),%ebp\n"
 
   // source_width = width * source_dx + ebx
   "mov    0x34(%esp), %ecx\n"
   "imull  0x38(%esp), %ecx\n"
   "mov    %ecx, 0x34(%esp)\n"
 
   "mov    0x38(%esp), %ecx\n"
   "xor    %ebx,%ebx\n"     // x = 0
   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
-  "jl     .lscaleend\n"
+  "jl     1f\n"
   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
-  "jmp    .lscaleend\n"
-
-".lscaleloop:"
-  "mov    %ebx,%eax\n"
-  "sar    $0x11,%eax\n"
+  "jmp    1f\n"
+
+"0:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
 
   "movzbl (%edi,%eax,1),%ecx\n"
   "movzbl 1(%edi,%eax,1),%esi\n"
   "mov    %ebx,%eax\n"
   "andl   $0x1fffe, %eax \n"
   "imul   %eax, %esi \n"
   "xorl   $0x1fffe, %eax \n"
   "imul   %eax, %ecx \n"
@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
   "imul   %eax, %esi \n"
   "xorl   $0xffff, %eax \n"
   "imul   %eax, %ecx \n"
   "addl   %esi, %ecx \n"
   "shrl   $16, %ecx \n"
   "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
 
   "cmp    0x34(%esp), %ebx\n"
-  "jge    .lscalelastpixel\n"
+  "jge    2f\n"
 
   "mov    %ebx,%eax\n"
   "sar    $0x10,%eax\n"
   "movzbl (%edx,%eax,1),%ecx\n"
   "movzbl 1(%edx,%eax,1),%esi\n"
   "mov    %ebx,%eax\n"
   "add    0x38(%esp),%ebx\n"
   "andl   $0xffff, %eax \n"
@@ -488,56 +525,78 @@ void LinearScaleYUVToRGB32Row(const uint
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
 
-".lscaleend:"
+"1:"
   "cmp    0x34(%esp), %ebx\n"
-  "jl     .lscaleloop\n"
+  "jl     0b\n"
   "popa\n"
   "ret\n"
 
-".lscalelastpixel:"
+"2:"
   "paddsw %mm0, %mm1\n"
   "psraw $6, %mm1\n"
   "packuswb %mm1, %mm1\n"
   "movd %mm1, (%ebp)\n"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
-
-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width,
-                                    int16 *kCoefficientsRgbY);
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx)
+{
+  if (mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                                 width, source_dx);
+    // Done: without this return the C fallback below would run as well.
+    return;
+  }
+
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                             width, source_dx);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+
+void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 int16 *kCoefficientsRgbY);
+
   asm(
   ".text\n"
-#if defined(OS_MACOSX)
-"_PICConvertYUVToRGB32Row:\n"
+#if defined(XP_MACOSX)
+"_PICConvertYUVToRGB32Row_SSE:\n"
 #else
-"PICConvertYUVToRGB32Row:\n"
+"PICConvertYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x38(%esp),%ecx\n"
 
-  "jmp    .Lconvertend\n"
-
-".Lconvertloop:"
+  "jmp    1f\n"
+
+"0:"
   "movzbl (%edi),%eax\n"
   "add    $0x1,%edi\n"
   "movzbl (%esi),%ebx\n"
   "add    $0x1,%esi\n"
   "movq   2048(%ecx,%eax,8),%mm0\n"
   "movzbl (%edx),%eax\n"
   "paddsw 4096(%ecx,%ebx,8),%mm0\n"
   "movzbl 0x1(%edx),%ebx\n"
@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
   "movq   0(%ecx,%ebx,8),%mm2\n"
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
-".Lconvertend:"
+"1:"
   "subl   $0x2,0x34(%esp)\n"
-  "jns    .Lconvertloop\n"
+  "jns    0b\n"
 
   "andl   $0x1,0x34(%esp)\n"
-  "je     .Lconvertdone\n"
+  "je     2f\n"
 
   "movzbl (%edi),%eax\n"
   "movq   2048(%ecx,%eax,8),%mm0\n"
   "movzbl (%esi),%eax\n"
   "paddsw 4096(%ecx,%eax,8),%mm0\n"
   "movzbl (%edx),%eax\n"
   "movq   0(%ecx,%eax,8),%mm1\n"
   "paddsw %mm0,%mm1\n"
   "psraw  $0x6,%mm1\n"
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
-".Lconvertdone:\n"
+"2:"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
-                              int width) {
-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
-                          &kCoefficientsRgbY[0][0]);
-}
-
-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
+                              int width)
+{
+  if (mozilla::supports_sse()) {
+    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                &kCoefficientsRgbY[0][0]);
+    return;
+  }
+
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width,
                                int source_dx,
                                int16 *kCoefficientsRgbY);
 
   asm(
   ".text\n"
-#if defined(OS_MACOSX)
-"_PICScaleYUVToRGB32Row:\n"
+#if defined(XP_MACOSX)
+"_PICScaleYUVToRGB32Row_SSE:\n"
 #else
-"PICScaleYUVToRGB32Row:\n"
+"PICScaleYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x3c(%esp),%ecx\n"
   "xor    %ebx,%ebx\n"
-  "jmp    Lscaleend\n"
-
-"Lscaleloop:"
+  "jmp    1f\n"
+
+"0:"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%edi,%eax,1),%eax\n"
   "movq   2048(%ecx,%eax,8),%mm0\n"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%esi,%eax,1),%eax\n"
   "paddsw 4096(%ecx,%eax,8),%mm0\n"
@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 
   "movq   0(%ecx,%eax,8),%mm2\n"
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
-"Lscaleend:"
+"1:"
   "subl   $0x2,0x34(%esp)\n"
-  "jns    Lscaleloop\n"
+  "jns    0b\n"
 
   "andl   $0x1,0x34(%esp)\n"
-  "je     Lscaledone\n"
+  "je     2f\n"
 
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%edi,%eax,1),%eax\n"
   "movq   2048(%ecx,%eax,8),%mm0\n"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
   "movzbl (%esi,%eax,1),%eax\n"
@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 
   "sar    $0x10,%eax\n"
   "movzbl (%edx,%eax,1),%eax\n"
   "movq   0(%ecx,%eax,8),%mm1\n"
   "paddsw %mm0,%mm1\n"
   "psraw  $0x6,%mm1\n"
   "packuswb %mm1,%mm1\n"
   "movd   %mm1,0x0(%ebp)\n"
 
-"Lscaledone:"
+"2:"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
-
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
-                        int source_dx) {
-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                        &kCoefficientsRgbY[0][0]);
-}
-
-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width,
-                                 int source_dx,
-                                 int16 *kCoefficientsRgbY);
+                        int source_dx)
+{
+  if (mozilla::supports_sse()) {
+    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+                              &kCoefficientsRgbY[0][0]);
+    return;
+  }
+
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     uint8* rgb_buf,
+                                     int width,
+                                     int source_dx,
+                                     int16 *kCoefficientsRgbY);
+
   asm(
   ".text\n"
-#if defined(OS_MACOSX)
-"_PICLinearScaleYUVToRGB32Row:\n"
+#if defined(XP_MACOSX)
+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
 #else
-"PICLinearScaleYUVToRGB32Row:\n"
+"PICLinearScaleYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
   "mov    0x3c(%esp),%edi\n"
   "xor    %ebx,%ebx\n"
 
   // source_width = width * source_dx + ebx
   "mov    0x34(%esp), %ecx\n"
   "imull  0x38(%esp), %ecx\n"
   "mov    %ecx, 0x34(%esp)\n"
 
   "mov    0x38(%esp), %ecx\n"
   "xor    %ebx,%ebx\n"     // x = 0
   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
-  "jl     .lscaleend\n"
+  "jl     1f\n"
   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
-  "jmp    .lscaleend\n"
-
-".lscaleloop:"
+  "jmp    1f\n"
+
+"0:"
   "mov    0x28(%esp),%esi\n"
   "mov    %ebx,%eax\n"
   "sar    $0x11,%eax\n"
 
   "movzbl (%esi,%eax,1),%ecx\n"
   "movzbl 1(%esi,%eax,1),%esi\n"
   "mov    %ebx,%eax\n"
   "andl   $0x1fffe, %eax \n"
@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
   "imul   %eax, %esi \n"
   "xorl   $0xffff, %eax \n"
   "imul   %eax, %ecx \n"
   "addl   %esi, %ecx \n"
   "shrl   $16, %ecx \n"
   "movq   (%edi,%ecx,8),%mm1\n"
 
   "cmp    0x34(%esp), %ebx\n"
-  "jge    .lscalelastpixel\n"
+  "jge    2f\n"
 
   "mov    %ebx,%eax\n"
   "sar    $0x10,%eax\n"
   "movzbl (%edx,%eax,1),%ecx\n"
   "movzbl 1(%edx,%eax,1),%esi\n"
   "mov    %ebx,%eax\n"
   "add    0x38(%esp),%ebx\n"
   "andl   $0xffff, %eax \n"
@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
   "paddsw %mm0,%mm1\n"
   "paddsw %mm0,%mm2\n"
   "psraw  $0x6,%mm1\n"
   "psraw  $0x6,%mm2\n"
   "packuswb %mm2,%mm1\n"
   "movntq %mm1,0x0(%ebp)\n"
   "add    $0x8,%ebp\n"
 
-".lscaleend:"
+"1:"
   "cmp    %ebx, 0x34(%esp)\n"
-  "jg     .lscaleloop\n"
+  "jg     0b\n"
   "popa\n"
   "ret\n"
 
-".lscalelastpixel:"
+"2:"
   "paddsw %mm0, %mm1\n"
   "psraw $6, %mm1\n"
   "packuswb %mm1, %mm1\n"
   "movd %mm1, (%ebp)\n"
   "popa\n"
   "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
 );
 
+
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                              &kCoefficientsRgbY[0][0]);
-}
-
-#else  // USE_MMX
-
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
-
-static inline void YuvPixel(uint8 y,
-                            uint8 u,
-                            uint8 v,
-                            uint8* rgb_buf) {
-
-  int b = kCoefficientsRgbY[256+u][0];
-  int g = kCoefficientsRgbY[256+u][1];
-  int r = kCoefficientsRgbY[256+u][2];
-  int a = kCoefficientsRgbY[256+u][3];
-
-  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
-  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
-  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
-  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
-
-  b = paddsw(b, kCoefficientsRgbY[y][0]);
-  g = paddsw(g, kCoefficientsRgbY[y][1]);
-  r = paddsw(r, kCoefficientsRgbY[y][2]);
-  a = paddsw(a, kCoefficientsRgbY[y][3]);
-
-  b >>= 6;
-  g >>= 6;
-  r >>= 6;
-  a >>= 6;
-
-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
-                                        (packuswb(g) << 8) |
-                                        (packuswb(r) << 16) |
-                                        (packuswb(a) << 24);
-}
-
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx)
+{
+  if (mozilla::supports_sse()) {
+    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                    source_dx, &kCoefficientsRgbY[0][0]);
+    return;
+  }
+
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+#else
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
-  for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
-    uint8 y0 = y_buf[x];
-    YuvPixel(y0, u, v, rgb_buf);
-    if ((x + 1) < width) {
-      uint8 y1 = y_buf[x + 1];
-      YuvPixel(y1, u, v, rgb_buf + 4);
-    }
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-}
-
-// 16.16 fixed point is used.  A shift by 16 isolates the integer.
-// A shift by 17 is used to further subsample the chrominence channels.
-// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
-// for 1/65536 pixel accurate interpolation.
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
                         int source_dx) {
-  int x = 0;
-  for (int i = 0; i < width; i += 2) {
-    int y = y_buf[x >> 16];
-    int u = u_buf[(x >> 17)];
-    int v = v_buf[(x >> 17)];
-    YuvPixel(y, u, v, rgb_buf);
-    x += source_dx;
-    if ((i + 1) < width) {
-      y = y_buf[x >> 16];
-      YuvPixel(y, u, v, rgb_buf+4);
-      x += source_dx;
-    }
-    rgb_buf += 8;
-  }
-}
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
 
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
-  int x = 0;
-  if (source_dx >= 0x20000) {
-    x = 32768;
-  }
-  for (int i = 0; i < width; i += 2) {
-    int y0 = y_buf[x >> 16];
-    int y1 = y_buf[(x >> 16) + 1];
-    int u0 = u_buf[(x >> 17)];
-    int u1 = u_buf[(x >> 17) + 1];
-    int v0 = v_buf[(x >> 17)];
-    int v1 = v_buf[(x >> 17) + 1];
-    int y_frac = (x & 65535);
-    int uv_frac = ((x >> 1) & 65535);
-    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
-    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
-    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
-    YuvPixel(y, u, v, rgb_buf);
-    x += source_dx;
-    if ((i + 1) < width) {
-      y0 = y_buf[x >> 16];
-      y1 = y_buf[(x >> 16) + 1];
-      y_frac = (x & 65535);
-      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
-      YuvPixel(y, u, v, rgb_buf+4);
-      x += source_dx;
-    }
-    rgb_buf += 8;
-  }
-}
-
-#endif  // USE_MMX
-}  // extern "C"
-
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+#endif
+
+}
diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
--- a/gfx/ycbcr/yuv_row_table.cpp
+++ b/gfx/ycbcr/yuv_row_table.cpp
@@ -1,13 +1,13 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "media/base/yuv_row.h"
+#include "yuv_row.h"
 
 extern "C" {
 
 #define RGBY(i) { \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
   0 \
diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
--- a/gfx/ycbcr/yuv_row_win.cpp
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -1,26 +1,27 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "media/base/yuv_row.h"
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
 
 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
 
 extern "C" {
 
-#if USE_MMX
-__declspec(naked)
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+__declspec(naked)
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       convertend
@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
  convertdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void ConvertYUVToRGB32Row(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width,
-                          int step) {
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     mov       ebx, [esp + 32 + 24]  // step
@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
  wdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* rgb_buf,
-                                int width,
-                                int ystep,
-                                int uvstep) {
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       wend
@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
  wdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void DoubleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width) {
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       wend
@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
     jns       wloop1
  wdone :
     popad
     ret
   }
 }
 
 // This version does general purpose scaling by any amount, up or down.
-// The only thing it can not do it rotation by 90 or 270.
-// For performance the chroma is under sampled, reducing cost of a 3x
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
 // 1080p scale from 8.4 ms to 5.4 ms.
 __declspec(naked)
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     xor       ebx, ebx              // x
@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
 
  scaledone :
     popad
     ret
   }
 }
 
 __declspec(naked)
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx) {
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]  // Y
     mov       edi, [esp + 32 + 8]  // U
                 // [esp + 32 + 12] // V
     mov       ebp, [esp + 32 + 16] // rgb
     mov       ecx, [esp + 32 + 20] // width
     imul      ecx, [esp + 32 + 24] // source_dx
@@ -438,152 +439,60 @@ lscalelastpixel:
     paddsw    mm1, mm0
     psraw     mm1, 6
     packuswb  mm1, mm1
     movd      [ebp], mm1
     popad
     ret
   };
 }
-#else  // USE_MMX
-
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
-
-static inline void YuvPixel(uint8 y,
-                            uint8 u,
-                            uint8 v,
-                            uint8* rgb_buf) {
-
-  int b = kCoefficientsRgbY[256+u][0];
-  int g = kCoefficientsRgbY[256+u][1];
-  int r = kCoefficientsRgbY[256+u][2];
-  int a = kCoefficientsRgbY[256+u][3];
-
-  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
-  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
-  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
-  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
-
-  b = paddsw(b, kCoefficientsRgbY[y][0]);
-  g = paddsw(g, kCoefficientsRgbY[y][1]);
-  r = paddsw(r, kCoefficientsRgbY[y][2]);
-  a = paddsw(a, kCoefficientsRgbY[y][3]);
-
-  b >>= 6;
-  g >>= 6;
-  r >>= 6;
-  a >>= 6;
-
-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
-                                        (packuswb(g) << 8) |
-                                        (packuswb(r) << 16) |
-                                        (packuswb(a) << 24);
-}
-
-#if TEST_MMX_YUV
-static inline void YuvPixel(uint8 y,
-                            uint8 u,
-                            uint8 v,
-                            uint8* rgb_buf) {
-
-  __asm {
-    movzx     eax, u
-    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
-    movzx     eax, v
-    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
-    movzx     eax, y
-    movq      mm1, [kCoefficientsRgbY + 8 * eax]
-    paddsw    mm1, mm0
-    psraw     mm1, 6
-    packuswb  mm1, mm1
-    mov       eax, rgb_buf
-    movd      [eax], mm1
-    emms
-  }
-}
-#endif
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
-  for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
-    uint8 y0 = y_buf[x];
-    YuvPixel(y0, u, v, rgb_buf);
-    if ((x + 1) < width) {
-      uint8 y1 = y_buf[x + 1];
-      YuvPixel(y1, u, v, rgb_buf + 4);
-    }
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-}
-
-// 16.16 fixed point is used.  A shift by 16 isolates the integer.
-// A shift by 17 is used to further subsample the chrominence channels.
-// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
-// for 1/65536 pixel accurate interpolation.
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {  // use the _SSE row routine when supports_sse() is true
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;
+  }
+#endif  // MOZILLA_MAY_SUPPORT_SSE && _M_IX86
+  // Fallback: _C routine. NOTE(review): meaning of the trailing 1 is not visible here; confirm in yuv_row.h.
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
                         int source_dx) {
-  int x = 0;
-  for (int i = 0; i < width; i += 2) {
-    int y = y_buf[x >> 16];
-    int u = u_buf[(x >> 17)];
-    int v = v_buf[(x >> 17)];
-    YuvPixel(y, u, v, rgb_buf);
-    x += source_dx;
-    if ((i + 1) < width) {
-      y = y_buf[x >> 16];
-      YuvPixel(y, u, v, rgb_buf+4);
-      x += source_dx;
-    }
-    rgb_buf += 8;
-  }
-}
+  // Dispatcher: forwards all six arguments unchanged to the _SSE or the _C row routine.
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {  // use the _SSE row routine when supports_sse() is true
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;
+  }
+#endif  // MOZILLA_MAY_SUPPORT_SSE && _M_IX86
+  // Fallback when the SSE branch is compiled out or supports_sse() is false.
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
 
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
-  int x = 0;
-  if (source_dx >= 0x20000) {
-    x = 32768;
-  }
-  for (int i = 0; i < width; i += 2) {
-    int y0 = y_buf[x >> 16];
-    int y1 = y_buf[(x >> 16) + 1];
-    int u0 = u_buf[(x >> 17)];
-    int u1 = u_buf[(x >> 17) + 1];
-    int v0 = v_buf[(x >> 17)];
-    int v1 = v_buf[(x >> 17) + 1];
-    int y_frac = (x & 65535);
-    int uv_frac = ((x >> 1) & 65535);
-    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
-    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
-    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
-    YuvPixel(y, u, v, rgb_buf);
-    x += source_dx;
-    if ((i + 1) < width) {
-      y0 = y_buf[x >> 16];
-      y1 = y_buf[(x >> 16) + 1];
-      y_frac = (x & 65535);
-      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
-      YuvPixel(y, u, v, rgb_buf+4);
-      x += source_dx;
-    }
-    rgb_buf += 8;
-  }
-}
-
-#endif  // USE_MMX
-}  // extern "C"
-
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {  // use the _SSE row routine when supports_sse() is true
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                 source_dx);
+    return;
+  }
+#endif  // MOZILLA_MAY_SUPPORT_SSE && _M_IX86
+  // Fallback when the SSE branch is compiled out or supports_sse() is false.
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+} // extern "C"