Bug 616778 - Part 2: Update NPOTB files in gfx/ycbcr. r=kinetik, a=NPOTB
author: Justin Lebar <justin.lebar@gmail.com>
Wed, 12 Jan 2011 19:08:17 -0800
changeset 60429 ab3e03c79004e2925c2868a412e525ecdd2f342e
parent 60428 d580ec700a1190dd833b059874c4f9b3374ac064
child 60430 de5d3839b7e50c02d6dfd8408da20b2245349f68
push id: unknown
push user: unknown
push date: unknown
reviewers: kinetik, NPOTB
bugs: 616778
milestone: 2.0b10pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 616778 - Part 2: Update NPOTB files in gfx/ycbcr. r=kinetik, a=NPOTB
gfx/ycbcr/README
gfx/ycbcr/arm.patch
gfx/ycbcr/convert.patch
gfx/ycbcr/update.sh
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -1,17 +1,20 @@
 This color conversion code is from the Chromium open source project available here:
 
 http://code.google.com/chromium/
 
-The code comes from svn revision 638400 on 2010-10-26.
+The code comes from svn revision 63840 on 2010-10-26.
 
 The code was copied from a Chromium svn checkout using the 'update.sh' script which then applies patches for our build and to add dynamic CPU detection.
 
-convert.patch: Change Chromium code to build using Mozilla build system.
-               Add runtime CPU detection for MMX
-               Move default C implementation to work on all platforms.
-               Change Chromium code to allow a picture region.
-               The YUV conversion will convert within this 
-               picture region only.
-               Add YCbCr 4:4:4 support
+convert.patch contains the following changes:
 
-arm.patch: Add YCbCr to rgb16_565 conversion support, bug 616469
+  * Change Chromium code to build using Mozilla build system.
+  * Add runtime CPU detection for MMX
+  * Move default C implementation to work on all platforms.
+  * Change Chromium code to allow a picture region.
+  * The YUV conversion will convert within this picture region only.
+  * Add YCbCr 4:4:4 support
+  * Bug 616469 - Add YCbCr to rgb16_565 conversion support.
+  * Bug 619178 - Update CPU detection in yuv_convert to new SSE.h interface.
+  * Bug 616778 - Split yuv_convert FilterRows vectorized code into separate files so it can
+    be properly guarded with cpuid() calls.
deleted file mode 100644
--- a/gfx/ycbcr/arm.patch
+++ /dev/null
@@ -1,347 +0,0 @@
-diff --git a/gfx/ycbcr/Makefile.in b/gfx/ycbcr/Makefile.in
---- a/gfx/ycbcr/Makefile.in
-+++ b/gfx/ycbcr/Makefile.in
-@@ -40,16 +40,21 @@ CPPSRCS += yuv_row_posix.cpp \
- else
- CPPSRCS += yuv_row_other.cpp \
-            $(NULL)
- endif # Darwin
- endif # SunOS
- endif # linux
- endif # windows
- 
-+ifeq (arm,$(findstring arm,$(OS_TEST)))
-+CPPSRCS += yuv_convert_arm.cpp \
-+           $(NULL)
-+endif
-+
- EXTRA_DSO_LDOPTS += \
-         $(LIBS_DIR) \
-         $(EXTRA_DSO_LIBS) \
-         $(XPCOM_LIBS) \
-         $(NSPR_LIBS) \
-         $(NULL)
- 
- include $(topsrcdir)/config/rules.mk
-diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
---- a/gfx/ycbcr/yuv_convert.cpp
-+++ b/gfx/ycbcr/yuv_convert.cpp
-@@ -19,25 +19,56 @@
- #include "yuv_convert.h"
- 
- // Header for low level row functions.
- #include "yuv_row.h"
- #define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
- #define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
- #include "mozilla/SSE.h"
- 
-+#ifdef HAVE_YCBCR_TO_RGB565
-+void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
-+#endif
-+
- namespace mozilla {
- 
- namespace gfx {
-  
- // 16.16 fixed point arithmetic
- const int kFractionBits = 16;
- const int kFractionMax = 1 << kFractionBits;
- const int kFractionMask = ((1 << kFractionBits) - 1);
- 
-+
-+// Convert a frame of YUV to 16 bit RGB565.
-+NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
-+                                  const uint8* u_buf,
-+                                  const uint8* v_buf,
-+                                  uint8* rgb_buf,
-+                                  int pic_x,
-+                                  int pic_y,
-+                                  int pic_width,
-+                                  int pic_height,
-+                                  int y_pitch,
-+                                  int uv_pitch,
-+                                  int rgb_pitch,
-+                                  YUVType yuv_type)
-+{
-+#ifdef HAVE_YCBCR_TO_RGB565
-+  for (int i = 0; i < pic_height; i++) {
-+    yv12_to_rgb565_neon((uint16*)rgb_buf + pic_width * i,
-+                         y_buf + y_pitch * i,
-+                         u_buf + uv_pitch * (i / 2),
-+                         v_buf + uv_pitch * (i / 2),
-+                         pic_width,
-+                         0);
-+  }
-+#endif
-+}
-+
- // Convert a frame of YUV to 32 bit ARGB.
- NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int pic_x,
-                                   int pic_y,
-                                   int pic_width,
-diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
---- a/gfx/ycbcr/yuv_convert.h
-+++ b/gfx/ycbcr/yuv_convert.h
-@@ -2,16 +2,20 @@
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- 
- #ifndef MEDIA_BASE_YUV_CONVERT_H_
- #define MEDIA_BASE_YUV_CONVERT_H_
- 
- #include "chromium_types.h"
- #include "gfxCore.h"
-+
-+#ifdef __arm__
-+#define HAVE_YCBCR_TO_RGB565 1
-+#endif
-  
- namespace mozilla {
- 
- namespace gfx {
-  
- // Type of YUV surface.
- // The value of these enums matter as they are used to shift vertical indices.
- enum YUVType {
-@@ -36,16 +40,31 @@ enum Rotate {
- // Filter affects how scaling looks.
- enum ScaleFilter {
-   FILTER_NONE = 0,        // No filter (point sampled).
-   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
-   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
-   FILTER_BILINEAR = 3     // Bilinear filter.
- };
- 
-+// Convert a frame of YUV to 16 bit RGB565.
-+// Pass in YV12 formats
-+NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
-+                                  const uint8* uplane,
-+                                  const uint8* vplane,
-+                                  uint8* rgbframe,
-+                                  int pic_x,
-+                                  int pic_y,
-+                                  int pic_width,
-+                                  int pic_height,
-+                                  int ystride,
-+                                  int uvstride,
-+                                  int rgbstride,
-+                                  YUVType yuv_type);
-+
- // Convert a frame of YUV to 32 bit ARGB.
- // Pass in YV16/YV12 depending on source format
- NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
-                                   const uint8* uplane,
-                                   const uint8* vplane,
-                                   uint8* rgbframe,
-                                   int pic_x,
-                                   int pic_y,
-diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
-new file mode 100644
---- /dev/null
-+++ b/gfx/ycbcr/yuv_convert_arm.cpp
-@@ -0,0 +1,201 @@
-+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-+// Use of this source code is governed by a BSD-style license that can be
-+// found in the LICENSE file.
-+
-+// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
-+
-+#include "yuv_convert.h"
-+
-+void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag)
-+{
-+    static __attribute__((aligned(16))) uint16 acc_r[8] = {
-+        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
-+    };
-+    static __attribute__((aligned(16))) uint16 acc_g[8] = {
-+        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
-+    };
-+    static __attribute__((aligned(16))) uint16 acc_b[8] = {
-+        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
-+    };
-+    /*
-+     * Registers:
-+     * q0, q1 : d0, d1, d2, d3  - are used for initial loading of YUV data
-+     * q2     : d4, d5          - are used for storing converted RGB data
-+     * q3     : d6, d7          - are used for temporary storage
-+     *
-+     * q4-q7 - reserved
-+     *
-+     * q8, q9 : d16, d17, d18, d19  - are used for expanded Y data
-+     * q10    : d20, d21
-+     * q11    : d22, d23
-+     * q12    : d24, d25
-+     * q13    : d26, d27
-+     * q13, q14, q15            - various constants (#16, #149, #204, #50, #104, #154)
-+     */
-+    asm volatile (
-+".fpu neon\n"
-+".macro convert_macroblock size\n"
-+/* load up to 16 source pixels */
-+	".if \\size == 16\n"
-+	    "pld [%[y], #64]\n"
-+	    "pld [%[u], #64]\n"
-+	    "pld [%[v], #64]\n"
-+	    "vld1.8 {d1}, [%[y]]!\n"
-+	    "vld1.8 {d3}, [%[y]]!\n"
-+	    "vld1.8 {d0}, [%[u]]!\n"
-+	    "vld1.8 {d2}, [%[v]]!\n"
-+	".elseif \\size == 8\n"
-+	    "vld1.8 {d1}, [%[y]]!\n"
-+	    "vld1.8 {d0[0]}, [%[u]]!\n"
-+	    "vld1.8 {d0[1]}, [%[u]]!\n"
-+	    "vld1.8 {d0[2]}, [%[u]]!\n"
-+	    "vld1.8 {d0[3]}, [%[u]]!\n"
-+	    "vld1.8 {d2[0]}, [%[v]]!\n"
-+	    "vld1.8 {d2[1]}, [%[v]]!\n"
-+	    "vld1.8 {d2[2]}, [%[v]]!\n"
-+	    "vld1.8 {d2[3]}, [%[v]]!\n"
-+	".elseif \\size == 4\n"
-+	    "vld1.8 {d1[0]}, [%[y]]!\n"
-+	    "vld1.8 {d1[1]}, [%[y]]!\n"
-+	    "vld1.8 {d1[2]}, [%[y]]!\n"
-+	    "vld1.8 {d1[3]}, [%[y]]!\n"
-+	    "vld1.8 {d0[0]}, [%[u]]!\n"
-+	    "vld1.8 {d0[1]}, [%[u]]!\n"
-+	    "vld1.8 {d2[0]}, [%[v]]!\n"
-+	    "vld1.8 {d2[1]}, [%[v]]!\n"
-+	".elseif \\size == 2\n"
-+	    "vld1.8 {d1[0]}, [%[y]]!\n"
-+	    "vld1.8 {d1[1]}, [%[y]]!\n"
-+	    "vld1.8 {d0[0]}, [%[u]]!\n"
-+	    "vld1.8 {d2[0]}, [%[v]]!\n"
-+	".elseif \\size == 1\n"
-+	    "vld1.8 {d1[0]}, [%[y]]!\n"
-+	    "vld1.8 {d0[0]}, [%[u]]!\n"
-+	    "vld1.8 {d2[0]}, [%[v]]!\n"
-+	".else\n"
-+	    ".error \"unsupported macroblock size\"\n"
-+	".endif\n"
-+
-+        /* d1 - Y data (first 8 bytes) */
-+        /* d3 - Y data (next 8 bytes) */
-+        /* d0 - U data, d2 - V data */
-+
-+	/* split even and odd Y color components */
-+	"vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
-+	/* clip upper and lower boundaries */
-+	"vqadd.u8    q0, q0, q4\n"
-+	"vqadd.u8    q1, q1, q4\n"
-+	"vqsub.u8    q0, q0, q5\n"
-+	"vqsub.u8    q1, q1, q5\n"
-+
-+	"vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
-+
-+	"vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
-+	"vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
-+
-+	"vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
-+	"vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
-+	"vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
-+	"vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
-+	"vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
-+	"vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
-+	"vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
-+	"vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
-+	"vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 */
-+
-+	"vhsub.s16   q3, q8, q10\n"                  /* calculate even red components */
-+	"vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
-+	"vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
-+	"vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
-+
-+	"vhadd.s16   q3, q8, q11\n"                  /* calculate even green components */
-+	"vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
-+	"vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
-+	"vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
-+
-+	"vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components */
-+	"vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
-+	"vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
-+	"vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
-+
-+	"vzip.8      d0, d3\n"                       /* join even and odd red components */
-+	"vzip.8      d1, d4\n"                       /* join even and odd green components */
-+	"vzip.8      d2, d5\n"                       /* join even and odd blue components */
-+
-+	"vshll.u8    q3, d0, #8\n\t"
-+	"vshll.u8    q8, d1, #8\n\t"
-+	"vshll.u8    q9, d2, #8\n\t"
-+	"vsri.u16    q3, q8, #5\t\n"
-+	"vsri.u16    q3, q9, #11\t\n"
-+	/* store pixel data to memory */
-+	".if \\size == 16\n"
-+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
-+	"    vshll.u8    q3, d3, #8\n\t"
-+	"    vshll.u8    q8, d4, #8\n\t"
-+	"    vshll.u8    q9, d5, #8\n\t"
-+	"    vsri.u16    q3, q8, #5\t\n"
-+	"    vsri.u16    q3, q9, #11\t\n"
-+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
-+	".elseif \\size == 8\n"
-+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
-+	".elseif \\size == 4\n"
-+	"    vst1.16 {d6}, [%[dst]]!\n"
-+	".elseif \\size == 2\n"
-+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
-+	"    vst1.16 {d6[1]}, [%[dst]]!\n"
-+	".elseif \\size == 1\n"
-+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
-+	".endif\n"
-+	".endm\n"
-+
-+	"vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
-+	"vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
-+	"vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
-+	"vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
-+
-+	"vmov.u8     d26, #16\n"
-+	"vmov.u8     d27, #149\n"
-+	"vmov.u8     d28, #204\n"
-+	"vmov.u8     d29, #50\n"
-+	"vmov.u8     d30, #104\n"
-+	"vmov.u8     d31, #154\n"
-+
-+	"cmp         %[oddflag], #0\n"
-+	"beq         1f\n"
-+	"convert_macroblock 1\n"
-+	"sub         %[n], %[n], #1\n"
-+    "1:\n"
-+	"subs        %[n], %[n], #16\n"
-+	"blt         2f\n"
-+    "1:\n"
-+	"convert_macroblock 16\n"
-+	"subs        %[n], %[n], #16\n"
-+	"bge         1b\n"
-+    "2:\n"
-+	"tst         %[n], #8\n"
-+	"beq         3f\n"
-+	"convert_macroblock 8\n"
-+    "3:\n"
-+	"tst         %[n], #4\n"
-+	"beq         4f\n"
-+	"convert_macroblock 4\n"
-+    "4:\n"
-+	"tst         %[n], #2\n"
-+	"beq         5f\n"
-+	"convert_macroblock 2\n"
-+    "5:\n"
-+	"tst         %[n], #1\n"
-+	"beq         6f\n"
-+	"convert_macroblock 1\n"
-+    "6:\n"
-+	".purgem convert_macroblock\n"
-+	: [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
-+	: [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
-+	  [oddflag] "r" (oddflag)
-+	: "cc", "memory",
-+	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
-+	  "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
-+	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
-+	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
-+    );
-+}
--- a/gfx/ycbcr/convert.patch
+++ b/gfx/ycbcr/convert.patch
@@ -1,13 +1,12 @@
-diff --git b/gfx/ycbcr/yuv_convert.cpp a/gfx/ycbcr/yuv_convert.cpp
-index bea0e50..ab4f10a 100644
---- b/gfx/ycbcr/yuv_convert.cpp
-+++ a/gfx/ycbcr/yuv_convert.cpp
-@@ -6,77 +6,104 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -6,145 +6,133 @@
  // http://www.fourcc.org/yuv.php
  // The actual conversion is best described here
  // http://en.wikipedia.org/wiki/YUV
  // An article on optimizing YUV conversion using tables instead of multiplies
  // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
  //
  // YV12 is a full plane of Y and a half height, half width chroma planes
  // YV16 is a full plane of Y and a full height, half width chroma planes
@@ -16,43 +15,74 @@ index bea0e50..ab4f10a 100644
  // ARGB pixel format is output, which on little endian is stored as BGRA.
  // The alpha is set to 255, allowing the application to use RGBA or RGB32.
  
 -#include "media/base/yuv_convert.h"
 +#include "yuv_convert.h"
  
  // Header for low level row functions.
 -#include "media/base/yuv_row.h"
-+#include "yuv_row.h"
-+#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
-+#define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
-+#include "mozilla/SSE.h"
- 
+-
 -#if USE_MMX
 -#if defined(_MSC_VER)
 -#include <intrin.h>
 -#else
 -#include <mmintrin.h>
 -#endif
 -#endif
 -
 -#if USE_SSE2
 -#include <emmintrin.h>
 -#endif
 -
 -namespace media {
+-
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
++#ifdef HAVE_YCBCR_TO_RGB565
++void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
++#endif
++
 +namespace mozilla {
- 
++
 +namespace gfx {
 + 
  // 16.16 fixed point arithmetic
  const int kFractionBits = 16;
  const int kFractionMax = 1 << kFractionBits;
  const int kFractionMask = ((1 << kFractionBits) - 1);
  
++
++// Convert a frame of YUV to 16 bit RGB565.
++NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int pic_x,
++                                  int pic_y,
++                                  int pic_width,
++                                  int pic_height,
++                                  int y_pitch,
++                                  int uv_pitch,
++                                  int rgb_pitch,
++                                  YUVType yuv_type)
++{
++#ifdef HAVE_YCBCR_TO_RGB565
++  for (int i = 0; i < pic_height; i++) {
++    yv12_to_rgb565_neon((uint16*)rgb_buf + pic_width * i,
++                         y_buf + y_pitch * i,
++                         u_buf + uv_pitch * (i / 2),
++                         v_buf + uv_pitch * (i / 2),
++                         pic_width,
++                         0);
++  }
++#endif
++}
++
  // Convert a frame of YUV to 32 bit ARGB.
 -void ConvertYUVToRGB32(const uint8* y_buf,
 -                       const uint8* u_buf,
 -                       const uint8* v_buf,
 -                       uint8* rgb_buf,
 -                       int width,
 -                       int height,
 -                       int y_pitch,
@@ -60,16 +90,23 @@ index bea0e50..ab4f10a 100644
 -                       int rgb_pitch,
 -                       YUVType yuv_type) {
 -  unsigned int y_shift = yuv_type;
 -  for (int y = 0; y < height; ++y) {
 -    uint8* rgb_row = rgb_buf + y * rgb_pitch;
 -    const uint8* y_ptr = y_buf + y * y_pitch;
 -    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
 -    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
+-
+-    FastConvertYUVToRGB32Row(y_ptr,
+-                             u_ptr,
+-                             v_ptr,
+-                             rgb_row,
+-                             width);
+-  }
 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
 +                                  const uint8* u_buf,
 +                                  const uint8* v_buf,
 +                                  uint8* rgb_buf,
 +                                  int pic_x,
 +                                  int pic_y,
 +                                  int pic_width,
 +                                  int pic_height,
@@ -87,22 +124,17 @@ index bea0e50..ab4f10a 100644
 +  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
 +  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
 +
 +  for (int y = pic_y; y < pic_height + pic_y; ++y) {
 +    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
 +    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
 +    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
 +    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
- 
--    FastConvertYUVToRGB32Row(y_ptr,
--                             u_ptr,
--                             v_ptr,
--                             rgb_row,
--                             width);
++
 +    if (odd_pic_x) {
 +      // Handle the single odd pixel manually and use the
 +      // fast routines for the remaining.
 +      FastConvertYUVToRGB32Row_C(y_ptr++,
 +                                 u_ptr++,
 +                                 v_ptr++,
 +                                 rgb_row,
 +                                 1,
@@ -120,66 +152,150 @@ index bea0e50..ab4f10a 100644
 +    else {
 +      FastConvertYUVToRGB32Row_C(y_ptr,
 +                                 u_ptr,
 +                                 v_ptr,
 +                                 rgb_row,
 +                                 x_width,
 +                                 x_shift);
 +    }
-   }
++  }
  
    // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
 -  EMMS();
+-}
+-
+-#if USE_SSE2
+-// FilterRows combines two rows of the image using linear interpolation.
+-// SSE2 version does 16 pixels at a time
+-
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+-                       int source_width, int source_y_fraction) {
+-  __m128i zero = _mm_setzero_si128();
+-  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+-  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+-
+-  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+-  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+-  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+-  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+-
+-  do {
+-    __m128i y0 = _mm_loadu_si128(y0_ptr128);
+-    __m128i y1 = _mm_loadu_si128(y1_ptr128);
+-    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+-    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+-    y0 = _mm_unpacklo_epi8(y0, zero);
+-    y1 = _mm_unpacklo_epi8(y1, zero);
+-    y0 = _mm_mullo_epi16(y0, y0_fraction);
+-    y1 = _mm_mullo_epi16(y1, y1_fraction);
+-    y2 = _mm_mullo_epi16(y2, y0_fraction);
+-    y3 = _mm_mullo_epi16(y3, y1_fraction);
+-    y0 = _mm_add_epi16(y0, y1);
+-    y2 = _mm_add_epi16(y2, y3);
+-    y0 = _mm_srli_epi16(y0, 8);
+-    y2 = _mm_srli_epi16(y2, 8);
+-    y0 = _mm_packus_epi16(y0, y2);
+-    *dest128++ = y0;
+-    ++y0_ptr128;
+-    ++y1_ptr128;
+-  } while (dest128 < end128);
+-}
+-#elif USE_MMX
+-// MMX version does 8 pixels at a time
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+-                       int source_width, int source_y_fraction) {
+-  __m64 zero = _mm_setzero_si64();
+-  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+-  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+-
+-  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+-  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+-  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+-  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+-
+-  do {
+-    __m64 y0 = *y0_ptr64++;
+-    __m64 y1 = *y1_ptr64++;
+-    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+-    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+-    y0 = _mm_unpacklo_pi8(y0, zero);
+-    y1 = _mm_unpacklo_pi8(y1, zero);
+-    y0 = _mm_mullo_pi16(y0, y0_fraction);
+-    y1 = _mm_mullo_pi16(y1, y1_fraction);
+-    y2 = _mm_mullo_pi16(y2, y0_fraction);
+-    y3 = _mm_mullo_pi16(y3, y1_fraction);
+-    y0 = _mm_add_pi16(y0, y1);
+-    y2 = _mm_add_pi16(y2, y3);
+-    y0 = _mm_srli_pi16(y0, 8);
+-    y2 = _mm_srli_pi16(y2, 8);
+-    y0 = _mm_packs_pu16(y0, y2);
+-    *dest64++ = y0;
+-  } while (dest64 < end64);
+-}
+-#else  // no MMX or SSE2
 +  if (has_sse)
 +    EMMS();
- }
- 
--#if USE_SSE2
-+#if defined(MOZILLA_COMPILE_WITH_SSE2)
- // FilterRows combines two rows of the image using linear interpolation.
- // SSE2 version does 16 pixels at a time
--
- static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                        int source_width, int source_y_fraction) {
-   __m128i zero = _mm_setzero_si128();
-   __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
-   __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
- 
-   const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
-   const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
-@@ -99,17 +126,17 @@ static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-     y0 = _mm_srli_epi16(y0, 8);
-     y2 = _mm_srli_epi16(y2, 8);
-     y0 = _mm_packus_epi16(y0, y2);
-     *dest128++ = y0;
-     ++y0_ptr128;
-     ++y1_ptr128;
-   } while (dest128 < end128);
- }
--#elif USE_MMX
-+#elif defined(MOZILLA_COMPILE_WITH_MMX)
- // MMX version does 8 pixels at a time
- static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                        int source_width, int source_y_fraction) {
-   __m64 zero = _mm_setzero_si64();
-   __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
-   __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
- 
-   const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
-@@ -154,44 +181,45 @@ static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++}
++
+ // C version does 8 at a time to mimic MMX code
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+-                       int source_width, int source_y_fraction) {
++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++                         int source_width, int source_y_fraction) {
+   int y1_fraction = source_y_fraction;
+   int y0_fraction = 256 - y1_fraction;
+   uint8* end = ybuf + source_width;
+   do {
+     ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+     ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+     ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+     ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
+     ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+     ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
      ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
      y0_ptr += 8;
      y1_ptr += 8;
      ybuf += 8;
    } while (ybuf < end);
  }
- #endif
+-#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_MMX
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++                    int source_width, int source_y_fraction);
++#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_SSE2
++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++                     int source_width, int source_y_fraction);
++#endif
++
++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
++                              const uint8* y1_ptr, int source_width,
++                              int source_y_fraction) {
++#ifdef MOZILLA_MAY_SUPPORT_SSE2
++  if (mozilla::supports_sse2()) {
++    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++    return;
++  }
++#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_MMX
++  if (mozilla::supports_mmx()) {
++    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++    return;
++  }
++#endif
++
++  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++}
  
--
+ 
  // Scale a frame of YUV to 32 bit ARGB.
 -void ScaleYUVToRGB32(const uint8* y_buf,
 -                     const uint8* u_buf,
 -                     const uint8* v_buf,
 -                     uint8* rgb_buf,
 -                     int source_width,
 -                     int source_height,
 -                     int width,
@@ -222,17 +338,17 @@ index bea0e50..ab4f10a 100644
    // Diagram showing origin and direction of source sampling.
    // ->0   4<-
    // 7       3
    //
    // 6       5
    // ->1   2<-
    // Rotations that start at right side of image.
    if ((view_rotate == ROTATE_180) ||
-@@ -243,17 +271,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+@@ -243,17 +262,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
        uv_pitch = 1;
      }
    }
  
    // Need padding because FilterRows() will write 1 to 16 extra pixels
    // after the end for SSE2 version.
    uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
    uint8* ybuf =
@@ -241,17 +357,17 @@ index bea0e50..ab4f10a 100644
    uint8* ubuf = ybuf + kFilterBufferSize;
    uint8* vbuf = ubuf + kFilterBufferSize;
    // TODO(fbarchard): Fixed point math is off by 1 on negatives.
    int yscale_fixed = (source_height << kFractionBits) / height;
  
    // TODO(fbarchard): Split this into separate function for better efficiency.
    for (int y = 0; y < height; ++y) {
      uint8* dest_pixel = rgb_buf + y * rgb_pitch;
-@@ -276,17 +304,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
      int source_uv_fraction =
          ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
  
      const uint8* y_ptr = y0_ptr;
      const uint8* u_ptr = u0_ptr;
      const uint8* v_ptr = v0_ptr;
      // Apply vertical filtering if necessary.
      // TODO(fbarchard): Remove memcpy when not necessary.
@@ -260,77 +376,111 @@ index bea0e50..ab4f10a 100644
        if (yscale_fixed != kFractionMax &&
            source_y_fraction && ((source_y + 1) < source_height)) {
          FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
        } else {
          memcpy(ybuf, y0_ptr, source_width);
        }
        y_ptr = ybuf;
        ybuf[source_width] = ybuf[source_width-1];
-@@ -309,17 +337,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+       u_ptr = ubuf;
+       v_ptr = vbuf;
+       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
+       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
+     }
+     if (source_dx == kFractionMax) {  // Not scaled
        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                 dest_pixel, width);
-     } else {
-       if (filter & FILTER_BILINEAR_H) {
+-    } else {
+-      if (filter & FILTER_BILINEAR_H) {
++    } else if (filter & FILTER_BILINEAR_H) {
          LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                   dest_pixel, width, source_dx);
      } else {
  // Specialized scalers and rotation.
 -#if USE_MMX && defined(_MSC_VER)
-+#if defined(_MSC_VER) && defined(_M_IX86)
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
++      if(mozilla::supports_sse()) {
          if (width == (source_width * 2)) {
-           DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                               dest_pixel, width);
+-          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+-                              dest_pixel, width);
++          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++                                  dest_pixel, width);
          } else if ((source_dx & kFractionMask) == 0) {
            // Scaling by integer scale factor. ie half.
-           ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                                dest_pixel, width,
-                                source_dx >> kFractionBits);
-@@ -331,16 +359,18 @@ void ScaleYUVToRGB32(const uint8* y_buf,
-                                      dest_pixel, width,
-                                      source_dx >> kFractionBits,
-                                      source_dx_uv >> kFractionBits);
+-          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+-                               dest_pixel, width,
+-                               source_dx >> kFractionBits);
++          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++                                   dest_pixel, width,
++                                   source_dx >> kFractionBits);
+         } else if (source_dx_uv == source_dx) {  // Not rotated.
+           ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                              dest_pixel, width, source_dx);
+         } else {
+-          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+-                                     dest_pixel, width,
+-                                     source_dx >> kFractionBits,
+-                                     source_dx_uv >> kFractionBits);
++          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++                                         dest_pixel, width,
++                                         source_dx >> kFractionBits,
++                                         source_dx_uv >> kFractionBits);
          }
++      }
++      else {
++        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++                             dest_pixel, width, source_dx);
++      }
  #else
-         ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                            dest_pixel, width, source_dx);
- #endif
+-        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+-                           dest_pixel, width, source_dx);
+-#endif
 -      }
-+      }      
++      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++                         dest_pixel, width, source_dx);
++#endif
      }
    }
    // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
 -  EMMS();
+-}
+-
+-}  // namespace media
 +  if (has_mmx)
 +    EMMS();
- }
- 
--}  // namespace media
++}
++
 +}  // namespace gfx
 +}  // namespace mozilla
-diff --git b/gfx/ycbcr/yuv_convert.h a/gfx/ycbcr/yuv_convert.h
-index 24a2c4e..eb99903 100644
---- b/gfx/ycbcr/yuv_convert.h
-+++ a/gfx/ycbcr/yuv_convert.h
-@@ -1,72 +1,79 @@
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -1,72 +1,98 @@
  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style license that can be
  // found in the LICENSE file.
  
  #ifndef MEDIA_BASE_YUV_CONVERT_H_
  #define MEDIA_BASE_YUV_CONVERT_H_
  
 -#include "base/basictypes.h"
 -
 -namespace media {
+-
 +#include "chromium_types.h"
 +#include "gfxCore.h"
++
++#ifdef __arm__
++#define HAVE_YCBCR_TO_RGB565 1
++#endif
 + 
 +namespace mozilla {
- 
++
 +namespace gfx {
 + 
  // Type of YUV surface.
  // The value of these enums matter as they are used to shift vertical indices.
  enum YUVType {
 -  YV16 = 0,           // YV16 is half width and full height chroma channels.
 -  YV12 = 1,           // YV12 is half width and half height chroma channels.
 +  YV12 = 0,           // YV12 is half width and half height chroma channels.
@@ -356,16 +506,31 @@ index 24a2c4e..eb99903 100644
  enum ScaleFilter {
    FILTER_NONE = 0,        // No filter (point sampled).
    FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
    FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
 -  FILTER_BILINEAR = 3,    // Bilinear filter.
 +  FILTER_BILINEAR = 3     // Bilinear filter.
  };
  
++// Convert a frame of YUV to 16 bit RGB565.
++// Pass in YV12 formats
++NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
++                                  const uint8* uplane,
++                                  const uint8* vplane,
++                                  uint8* rgbframe,
++                                  int pic_x,
++                                  int pic_y,
++                                  int pic_width,
++                                  int pic_height,
++                                  int ystride,
++                                  int uvstride,
++                                  int rgbstride,
++                                  YUVType yuv_type);
++
  // Convert a frame of YUV to 32 bit ARGB.
  // Pass in YV16/YV12 depending on source format
 -void ConvertYUVToRGB32(const uint8* yplane,
 -                       const uint8* uplane,
 -                       const uint8* vplane,
 -                       uint8* rgbframe,
 -                       int width,
 -                       int height,
@@ -399,40 +564,142 @@ index 24a2c4e..eb99903 100644
 -                     int ystride,
 -                     int uvstride,
 -                     int rgbstride,
 -                     YUVType yuv_type,
 -                     Rotate view_rotate,
 -                     ScaleFilter filter);
 -
 -}  // namespace media
+-
 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
 +                                const uint8* uplane,
 +                                const uint8* vplane,
 +                                uint8* rgbframe,
 +                                int source_width,
 +                                int source_height,
 +                                int width,
 +                                int height,
 +                                int ystride,
 +                                int uvstride,
 +                                int rgbstride,
 +                                YUVType yuv_type,
 +                                Rotate view_rotate,
 +                                ScaleFilter filter);
- 
++
 +}  // namespace gfx
 +}  // namespace mozilla
 + 
  #endif  // MEDIA_BASE_YUV_CONVERT_H_
-diff --git b/gfx/ycbcr/yuv_row.h a/gfx/ycbcr/yuv_row.h
-index 0a2990b..4ce9eb8 100644
---- b/gfx/ycbcr/yuv_row.h
-+++ a/gfx/ycbcr/yuv_row.h
-@@ -5,27 +5,40 @@
+diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_convert_mmx.cpp
+@@ -0,0 +1,45 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include <mmintrin.h>
++#include "yuv_row.h"
++
++namespace mozilla {
++namespace gfx {
++
++// FilterRows combines two rows of the image using linear interpolation:
++// ybuf = (y0 * (256 - fraction) + y1 * fraction) >> 8. MMX: 8 pixels per pass.
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++                    int source_width, int source_y_fraction) {
++  __m64 zero = _mm_setzero_si64();  // interleaved below to widen bytes to 16 bits
++  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);  // weight of row y1
++  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);  // weight of row y0
++
++  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
++  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
++  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
++  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
++
++  do {  // body runs before the bounds test, so at least 8 bytes are written
++    __m64 y0 = *y0_ptr64++;
++    __m64 y1 = *y1_ptr64++;
++    __m64 y2 = _mm_unpackhi_pi8(y0, zero);  // high 4 bytes of y0 as 16-bit lanes
++    __m64 y3 = _mm_unpackhi_pi8(y1, zero);  // high 4 bytes of y1 as 16-bit lanes
++    y0 = _mm_unpacklo_pi8(y0, zero);  // low 4 bytes of y0 as 16-bit lanes
++    y1 = _mm_unpacklo_pi8(y1, zero);  // low 4 bytes of y1 as 16-bit lanes
++    y0 = _mm_mullo_pi16(y0, y0_fraction);
++    y1 = _mm_mullo_pi16(y1, y1_fraction);
++    y2 = _mm_mullo_pi16(y2, y0_fraction);
++    y3 = _mm_mullo_pi16(y3, y1_fraction);
++    y0 = _mm_add_pi16(y0, y1);  // weighted sum, low half
++    y2 = _mm_add_pi16(y2, y3);  // weighted sum, high half
++    y0 = _mm_srli_pi16(y0, 8);  // >> 8 undoes the 256 scale of the weights
++    y2 = _mm_srli_pi16(y2, 8);
++    y0 = _mm_packs_pu16(y0, y2);  // repack both halves into 8 bytes
++    *dest64++ = y0;
++  } while (dest64 < end64);  // whole 8-byte groups: may run past source_width
++}
++
++}
++}
+diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_convert_sse2.cpp
+@@ -0,0 +1,47 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include <emmintrin.h>
++#include "yuv_row.h"
++
++namespace mozilla {
++namespace gfx {
++
++// FilterRows combines two rows of the image using linear interpolation:
++// ybuf = (y0 * (256 - fraction) + y1 * fraction) >> 8. SSE2: 16 pixels per pass.
++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++                     int source_width, int source_y_fraction) {
++  __m128i zero = _mm_setzero_si128();  // interleaved below to widen bytes to 16 bits
++  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);  // weight of row y1
++  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);  // weight of row y0
++
++  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
++  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
++  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
++  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
++
++  do {  // body runs before the bounds test, so at least 16 bytes are written
++    __m128i y0 = _mm_loadu_si128(y0_ptr128);  // unaligned load of source row 0
++    __m128i y1 = _mm_loadu_si128(y1_ptr128);  // unaligned load of source row 1
++    __m128i y2 = _mm_unpackhi_epi8(y0, zero);  // high 8 bytes of y0 as 16-bit lanes
++    __m128i y3 = _mm_unpackhi_epi8(y1, zero);  // high 8 bytes of y1 as 16-bit lanes
++    y0 = _mm_unpacklo_epi8(y0, zero);  // low 8 bytes of y0 as 16-bit lanes
++    y1 = _mm_unpacklo_epi8(y1, zero);  // low 8 bytes of y1 as 16-bit lanes
++    y0 = _mm_mullo_epi16(y0, y0_fraction);
++    y1 = _mm_mullo_epi16(y1, y1_fraction);
++    y2 = _mm_mullo_epi16(y2, y0_fraction);
++    y3 = _mm_mullo_epi16(y3, y1_fraction);
++    y0 = _mm_add_epi16(y0, y1);  // weighted sum, low half
++    y2 = _mm_add_epi16(y2, y3);  // weighted sum, high half
++    y0 = _mm_srli_epi16(y0, 8);  // >> 8 undoes the 256 scale of the weights
++    y2 = _mm_srli_epi16(y2, 8);
++    y0 = _mm_packus_epi16(y0, y2);  // repack both halves into 16 bytes
++    *dest128++ = y0;  // NOTE(review): presumably needs 16-byte aligned ybuf - confirm
++    ++y0_ptr128;
++    ++y1_ptr128;
++  } while (dest128 < end128);  // whole 16-byte groups: may run past source_width
++}
++
++}
++}
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -5,109 +5,133 @@
  // yuv_row internal functions to handle YUV conversion and scaling to RGB.
  // These functions are used from both yuv_convert.cc and yuv_scale.cc.
  
  // TODO(fbarchard): Write function that can handle rotation and scaling.
  
  #ifndef MEDIA_BASE_YUV_ROW_H_
  #define MEDIA_BASE_YUV_ROW_H_
  
@@ -443,38 +710,84 @@ index 0a2990b..4ce9eb8 100644
  // Can only do 1x.
  // This is the second fastest of the scalers.
  void FastConvertYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width);
  
+-// Can do 1x, half size or any scale down by an integer amount.
+-// Step can be negative (mirroring, rotate 180).
+-// This is the third fastest of the scalers.
+-void ConvertYUVToRGB32Row(const uint8* y_buf,
+-                          const uint8* u_buf,
+-                          const uint8* v_buf,
+-                          uint8* rgb_buf,
+-                          int width,
+-                          int step);
+-
+-// Rotate is like Convert, but applies different step to Y versus U and V.
+-// This allows rotation by 90 or 270, by stepping by stride.
+-// This is the forth fastest of the scalers.
+-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
-+                                const uint8* u_buf,
-+                                const uint8* v_buf,
-+                                uint8* rgb_buf,
-+                                int width,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+-                                int ystep,
+-                                int uvstep);
 +                                unsigned int x_shift);
 +
 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
 +                              const uint8* u_buf,
 +                              const uint8* v_buf,
 +                              uint8* rgb_buf,
 +                              int width);
 +
- // Can do 1x, half size or any scale down by an integer amount.
- // Step can be negative (mirroring, rotate 180).
- // This is the third fastest of the scalers.
- void ConvertYUVToRGB32Row(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb_buf,
-                           int width,
-@@ -55,59 +68,67 @@ void DoubleYUVToRGB32Row(const uint8* y_buf,
++// Can do 1x, half size or any scale down by an integer amount.
++// Step can be negative (mirroring, rotate 180).
++// This is the third fastest of the scalers.
++// Only defined on Windows x86-32.
++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int step);
++
++// Rotate is like Convert, but applies different step to Y versus U and V.
++// This allows rotation by 90 or 270, by stepping by stride.
++// This is the fourth fastest of the scalers.
++// Only defined on Windows x86-32.
++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                    const uint8* u_buf,
++                                    const uint8* v_buf,
++                                    uint8* rgb_buf,
++                                    int width,
++                                    int ystep,
++                                    int uvstep);
+ 
+ // Doubler does 4 pixels at a time.  Each pixel is replicated.
+ // This is the fastest of the scalers.
+-void DoubleYUVToRGB32Row(const uint8* y_buf,
+-                         const uint8* u_buf,
+-                         const uint8* v_buf,
+-                         uint8* rgb_buf,
+-                         int width);
++// Only defined on Windows x86-32.
++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
++                             const uint8* u_buf,
++                             const uint8* v_buf,
++                             uint8* rgb_buf,
++                             int width);
+ 
+ // Handles arbitrary scaling up or down.
+ // Mirroring is supported, but not 90 or 270 degree rotation.
  // Chroma is under sampled every 2 pixels for performance.
  void ScaleYUVToRGB32Row(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width,
                          int source_dx);
  
@@ -554,31 +867,31 @@ index 0a2990b..4ce9eb8 100644
  #if defined(_MSC_VER)
  #define EMMS() __asm emms
  #pragma warning(disable: 4799)
  #else
  #define EMMS() asm("emms")
  #endif
  #else
  #define EMMS()
-diff --git b/gfx/ycbcr/yuv_row_c.cpp a/gfx/ycbcr/yuv_row_c.cpp
-index a66fa7b..d327f85 100644
---- b/gfx/ycbcr/yuv_row_c.cpp
-+++ a/gfx/ycbcr/yuv_row_c.cpp
+diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
+--- a/gfx/ycbcr/yuv_row_c.cpp
++++ b/gfx/ycbcr/yuv_row_c.cpp
 @@ -1,812 +1,18 @@
  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style license that can be
  // found in the LICENSE file.
  
 -#include "media/base/yuv_row.h"
-+#include "yuv_row.h"
- 
+-
 -#ifdef _DEBUG
 -#include "base/logging.h"
 -#else
++#include "yuv_row.h"
++
  #define DCHECK(a)
 -#endif
  
  extern "C" {
  
 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
 -
 -// AMD64 ABI uses register paremters.
@@ -1463,50 +1776,53 @@ index a66fa7b..d327f85 100644
    int x = 0;
    if (source_dx >= 0x20000) {
      x = 32768;
    }
    for (int i = 0; i < width; i += 2) {
      int y0 = y_buf[x >> 16];
      int y1 = y_buf[(x >> 16) + 1];
      int u0 = u_buf[(x >> 17)];
-@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
        y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
        YuvPixel(y, u, v, rgb_buf+4);
        x += source_dx;
      }
      rgb_buf += 8;
    }
  }
  
 -#endif  // USE_MMX
  }  // extern "C"
  
-diff --git b/gfx/ycbcr/yuv_row_posix.cpp a/gfx/ycbcr/yuv_row_posix.cpp
-index a66fa7b..382c2bd 100644
---- b/gfx/ycbcr/yuv_row_posix.cpp
-+++ a/gfx/ycbcr/yuv_row_posix.cpp
-@@ -1,33 +1,29 @@
+diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
+--- a/gfx/ycbcr/yuv_row_posix.cpp
++++ b/gfx/ycbcr/yuv_row_posix.cpp
+@@ -1,33 +1,32 @@
  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style license that can be
  // found in the LICENSE file.
  
 -#include "media/base/yuv_row.h"
-+#include "yuv_row.h"
- 
+-
 -#ifdef _DEBUG
 -#include "base/logging.h"
 -#else
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
  #define DCHECK(a)
 -#endif
  
  extern "C" {
  
 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
 +#if defined(ARCH_CPU_X86_64)
++
++// We don't need CPUID guards here, since x86-64 implies SSE2.
  
  // AMD64 ABI uses register paremters.
  void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                                const uint8* u_buf,  // rsi
                                const uint8* v_buf,  // rdx
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
    asm(
@@ -1517,32 +1833,33 @@ index a66fa7b..382c2bd 100644
    "movzb  (%1),%%r10\n"
    "add    $0x1,%1\n"
    "movzb  (%2),%%r11\n"
    "add    $0x1,%2\n"
    "movq   2048(%5,%%r10,8),%%xmm0\n"
    "movzb  (%0),%%r10\n"
    "movq   4096(%5,%%r11,8),%%xmm1\n"
    "movzb  0x1(%0),%%r11\n"
-@@ -37,36 +33,36 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
+@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
    "movq   (%5,%%r11,8),%%xmm3\n"
    "paddsw %%xmm0,%%xmm2\n"
    "paddsw %%xmm0,%%xmm3\n"
    "shufps $0x44,%%xmm3,%%xmm2\n"
    "psraw  $0x6,%%xmm2\n"
    "packuswb %%xmm2,%%xmm2\n"
    "movq   %%xmm2,0x0(%3)\n"
    "add    $0x8,%3\n"
 -"convertend:"
 +"1:"
    "sub    $0x2,%4\n"
 -  "jns    convertloop\n"
+-
+-"convertnext:"
 +  "jns    0b\n"
- 
--"convertnext:"
++
 +"2:"
    "add    $0x1,%4\n"
 -  "js     convertdone\n"
 +  "js     3f\n"
  
    "movzb  (%1),%%r10\n"
    "movq   2048(%5,%%r10,8),%%xmm0\n"
    "movzb  (%2),%%r10\n"
@@ -1559,51 +1876,53 @@ index a66fa7b..382c2bd 100644
    :
    : "r"(y_buf),  // %0
      "r"(u_buf),  // %1
      "r"(v_buf),  // %2
      "r"(rgb_buf),  // %3
      "r"(width),  // %4
      "r" (kCoefficientsRgbY)  // %5
    : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-@@ -77,19 +73,19 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
+@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
                          const uint8* u_buf,  // rsi
                          const uint8* v_buf,  // rdx
                          uint8* rgb_buf,      // rcx
                          int width,           // r8
                          int source_dx) {     // r9
    asm(
    "xor    %%r11,%%r11\n"
    "sub    $0x2,%4\n"
 -  "js     scalenext\n"
+-
+-"scaleloop:"
 +  "js     1f\n"
- 
--"scaleloop:"
++
 +"0:"
    "mov    %%r11,%%r10\n"
    "sar    $0x11,%%r10\n"
    "movzb  (%1,%%r10,1),%%rax\n"
    "movq   2048(%5,%%rax,8),%%xmm0\n"
    "movzb  (%2,%%r10,1),%%rax\n"
    "movq   4096(%5,%%rax,8),%%xmm1\n"
    "lea    (%%r11,%6),%%r10\n"
    "sar    $0x10,%%r11\n"
-@@ -103,38 +99,38 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
+@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
    "paddsw %%xmm0,%%xmm1\n"
    "paddsw %%xmm0,%%xmm2\n"
    "shufps $0x44,%%xmm2,%%xmm1\n"
    "psraw  $0x6,%%xmm1\n"
    "packuswb %%xmm1,%%xmm1\n"
    "movq   %%xmm1,0x0(%3)\n"
    "add    $0x8,%3\n"
    "sub    $0x2,%4\n"
 -  "jns    scaleloop\n"
+-
+-"scalenext:"
 +  "jns    0b\n"
- 
--"scalenext:"
++
 +"1:"
    "add    $0x1,%4\n"
 -  "js     scaledone\n"
 +  "js     2f\n"
  
    "mov    %%r11,%%r10\n"
    "sar    $0x11,%%r10\n"
    "movzb  (%1,%%r10,1),%%rax\n"
@@ -1624,70 +1943,72 @@ index a66fa7b..382c2bd 100644
    :
    : "r"(y_buf),  // %0
      "r"(u_buf),  // %1
      "r"(v_buf),  // %2
      "r"(rgb_buf),  // %3
      "r"(width),  // %4
      "r" (kCoefficientsRgbY),  // %5
      "r"(static_cast<long>(source_dx))  // %6
-@@ -146,23 +142,23 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width,
                                int source_dx) {
    asm(
    "xor    %%r11,%%r11\n"   // x = 0
    "sub    $0x2,%4\n"
 -  "js     .lscalenext\n"
 +  "js     2f\n"
    "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
 -  "jl     .lscalehalf\n"
 +  "jl     0f\n"
    "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
 -".lscalehalf:"
+-
+-".lscaleloop:"
 +"0:"
- 
--".lscaleloop:"
++
 +"1:"
    "mov    %%r11,%%r10\n"
    "sar    $0x11,%%r10\n"
  
    "movzb  (%1, %%r10, 1), %%r13 \n"
    "movzb  1(%1, %%r10, 1), %%r14 \n"
    "mov    %%r11, %%rax \n"
    "and    $0x1fffe, %%rax \n"
    "imul   %%rax, %%r14 \n"
-@@ -215,21 +211,21 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
    "paddsw %%xmm0,%%xmm1\n"
    "paddsw %%xmm0,%%xmm2\n"
    "shufps $0x44,%%xmm2,%%xmm1\n"
    "psraw  $0x6,%%xmm1\n"
    "packuswb %%xmm1,%%xmm1\n"
    "movq   %%xmm1,0x0(%3)\n"
    "add    $0x8,%3\n"
    "sub    $0x2,%4\n"
 -  "jns    .lscaleloop\n"
+-
+-".lscalenext:"
 +  "jns    1b\n"
- 
--".lscalenext:"
++
 +"2:"
    "add    $0x1,%4\n"
 -  "js     .lscaledone\n"
 +  "js     3f\n"
  
    "mov    %%r11,%%r10\n"
    "sar    $0x11,%%r10\n"
  
    "movzb  (%1,%%r10,1), %%r13 \n"
    "movq   2048(%5,%%r13,8),%%xmm0\n"
  
    "movzb  (%2,%%r10,1), %%r13 \n"
-@@ -241,52 +237,52 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
    "movzb  (%0,%%r11,1), %%r13 \n"
    "movq   (%5,%%r13,8),%%xmm1\n"
  
    "paddsw %%xmm0,%%xmm1\n"
    "psraw  $0x6,%%xmm1\n"
    "packuswb %%xmm1,%%xmm1\n"
    "movd   %%xmm1,0x0(%3)\n"
  
@@ -1701,51 +2022,59 @@ index a66fa7b..382c2bd 100644
      "r"(width),  // %4
      "r" (kCoefficientsRgbY),  // %5
      "r"(static_cast<long>(source_dx))  // %6
    : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
  );
  }
  
 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
-+#elif defined(ARCH_CPU_X86_32) && !defined(__PIC__)
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
  
  // PIC version is slower because less registers are available, so
  // non-PIC is used on platforms where it is possible.
 -
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width);
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width);
++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int width);
    asm(
    ".text\n"
-   ".global FastConvertYUVToRGB32Row\n"
-+  ".type FastConvertYUVToRGB32Row, @function\n"
- "FastConvertYUVToRGB32Row:\n"
+-  ".global FastConvertYUVToRGB32Row\n"
+-"FastConvertYUVToRGB32Row:\n"
++  ".global FastConvertYUVToRGB32Row_SSE\n"
++  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
++"FastConvertYUVToRGB32Row_SSE:\n"
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x28(%esp),%edi\n"
    "mov    0x2c(%esp),%esi\n"
    "mov    0x30(%esp),%ebp\n"
    "mov    0x34(%esp),%ecx\n"
 -  "jmp    convertend\n"
+-
+-"convertloop:"
 +  "jmp    1f\n"
- 
--"convertloop:"
++
 +"0:"
    "movzbl (%edi),%eax\n"
    "add    $0x1,%edi\n"
    "movzbl (%esi),%ebx\n"
    "add    $0x1,%esi\n"
    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    "movzbl (%edx),%eax\n"
    "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
    "movzbl 0x1(%edx),%ebx\n"
-@@ -295,59 +291,63 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
    "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
@@ -1773,49 +2102,73 @@ index a66fa7b..382c2bd 100644
 +"2:"
    "popa\n"
    "ret\n"
 +#if !defined(XP_MACOSX)
 +  ".previous\n"
 +#endif
  );
  
- 
- void ScaleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width,
-                         int source_dx);
+-
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+-                        const uint8* u_buf,
+-                        const uint8* v_buf,
+-                        uint8* rgb_buf,
+-                        int width,
+-                        int source_dx);
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width)
++{
++  if (mozilla::supports_sse()) {  // runtime check: use the SSE row routine
++    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
++    return;  // row is done; skip the C fallback
++  }
++
++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // x_shift = 1
++}
++
++
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                            const uint8* u_buf,
++                            const uint8* v_buf,
++                            uint8* rgb_buf,
++                            int width,
++                            int source_dx);
    asm(
    ".text\n"
-   ".global ScaleYUVToRGB32Row\n"
-+  ".type ScaleYUVToRGB32Row, @function\n"
- "ScaleYUVToRGB32Row:\n"
+-  ".global ScaleYUVToRGB32Row\n"
+-"ScaleYUVToRGB32Row:\n"
++  ".global ScaleYUVToRGB32Row_SSE\n"
++  ".type ScaleYUVToRGB32Row_SSE, @function\n"
++"ScaleYUVToRGB32Row_SSE:\n"
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x28(%esp),%edi\n"
    "mov    0x2c(%esp),%esi\n"
    "mov    0x30(%esp),%ebp\n"
    "mov    0x34(%esp),%ecx\n"
    "xor    %ebx,%ebx\n"
 -  "jmp    scaleend\n"
+-
+-"scaleloop:"
 +  "jmp    1f\n"
- 
--"scaleloop:"
++
 +"0:"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%edi,%eax,1),%eax\n"
    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%esi,%eax,1),%eax\n"
    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
-@@ -363,22 +363,22 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
    "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
@@ -1831,17 +2184,17 @@ index a66fa7b..382c2bd 100644
  
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%edi,%eax,1),%eax\n"
    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%esi,%eax,1),%eax\n"
-@@ -387,49 +387,53 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
    "sar    $0x10,%eax\n"
    "movzbl (%edx,%eax,1),%eax\n"
    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
    "paddsw %mm0,%mm1\n"
    "psraw  $0x6,%mm1\n"
    "packuswb %mm1,%mm1\n"
    "movd   %mm1,0x0(%ebp)\n"
  
@@ -1849,27 +2202,51 @@ index a66fa7b..382c2bd 100644
 +"2:"
    "popa\n"
    "ret\n"
 +#if !defined(XP_MACOSX)
 +  ".previous\n"
 +#endif
  );
  
- void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width,
-                               int source_dx);
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width,
+-                              int source_dx);
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int source_dx)
++{
++  if (mozilla::supports_sse()) {  // runtime check: use the SSE row scaler
++    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++    return;  // row is done; without this the C fallback would redo it
++  }
++
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
++                       width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int width,
++                                  int source_dx);
    asm(
    ".text\n"
-   ".global LinearScaleYUVToRGB32Row\n"
-+  ".type LinearScaleYUVToRGB32Row, @function\n"
- "LinearScaleYUVToRGB32Row:\n"
+-  ".global LinearScaleYUVToRGB32Row\n"
+-"LinearScaleYUVToRGB32Row:\n"
++  ".global LinearScaleYUVToRGB32Row_SSE\n"
++  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
++"LinearScaleYUVToRGB32Row_SSE:\n"
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x28(%esp),%edi\n"
    "mov    0x30(%esp),%ebp\n"
  
    // source_width = width * source_dx + ebx
    "mov    0x34(%esp), %ecx\n"
    "imull  0x38(%esp), %ecx\n"
@@ -1877,29 +2254,34 @@ index a66fa7b..382c2bd 100644
  
    "mov    0x38(%esp), %ecx\n"
    "xor    %ebx,%ebx\n"     // x = 0
    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
 -  "jl     .lscaleend\n"
 +  "jl     1f\n"
    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
 -  "jmp    .lscaleend\n"
-+  "jmp    1f\n"
- 
+-
 -".lscaleloop:"
+-  "mov    %ebx,%eax\n"
+-  "sar    $0x11,%eax\n"
++  "jmp    1f\n"
++
 +"0:"
-   "mov    %ebx,%eax\n"
-   "sar    $0x11,%eax\n"
++  "mov    %ebx,%eax\n"
++  "sar    $0x11,%eax\n"
  
    "movzbl (%edi,%eax,1),%ecx\n"
    "movzbl 1(%edi,%eax,1),%esi\n"
    "mov    %ebx,%eax\n"
    "andl   $0x1fffe, %eax \n"
    "imul   %eax, %esi \n"
-@@ -464,17 +468,17 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+   "xorl   $0x1fffe, %eax \n"
+   "imul   %eax, %ecx \n"
+@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
    "imul   %eax, %esi \n"
    "xorl   $0xffff, %eax \n"
    "imul   %eax, %ecx \n"
    "addl   %esi, %ecx \n"
    "shrl   $16, %ecx \n"
    "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
  
    "cmp    0x34(%esp), %ebx\n"
@@ -1908,17 +2290,17 @@ index a66fa7b..382c2bd 100644
  
    "mov    %ebx,%eax\n"
    "sar    $0x10,%eax\n"
    "movzbl (%edx,%eax,1),%ecx\n"
    "movzbl 1(%edx,%eax,1),%esi\n"
    "mov    %ebx,%eax\n"
    "add    0x38(%esp),%ebx\n"
    "andl   $0xffff, %eax \n"
-@@ -488,56 +492,60 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
  
@@ -1939,60 +2321,80 @@ index a66fa7b..382c2bd 100644
    "popa\n"
    "ret\n"
 +#if !defined(XP_MACOSX)
 +  ".previous\n"
 +#endif
  );
  
 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
-+#elif defined(ARCH_CPU_X86_32) && defined(__PIC__)
-+
-+void PICConvertYUVToRGB32Row(const uint8* y_buf,
-+                             const uint8* u_buf,
-+                             const uint8* v_buf,
-+                             uint8* rgb_buf,
-+                             int width,
-+                             int16 *kCoefficientsRgbY);
- 
+-
 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
 -                                    const uint8* u_buf,
 -                                    const uint8* v_buf,
 -                                    uint8* rgb_buf,
 -                                    int width,
 -                                    int16 *kCoefficientsRgbY);
++// Uses the SSE routine when available, else the portable C fallback.
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int source_dx)
++{
++  if (mozilla::supports_sse()) {
++    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
++                                 width, source_dx);
++    return;  // Row already written; must not fall through to the C path.
++  }
++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
++
++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                 const uint8* u_buf,
++                                 const uint8* v_buf,
++                                 uint8* rgb_buf,
++                                 int width,
++                                 int16 *kCoefficientsRgbY);
++
    asm(
    ".text\n"
 -#if defined(OS_MACOSX)
+-"_PICConvertYUVToRGB32Row:\n"
 +#if defined(XP_MACOSX)
- "_PICConvertYUVToRGB32Row:\n"
++"_PICConvertYUVToRGB32Row_SSE:\n"
  #else
- "PICConvertYUVToRGB32Row:\n"
+-"PICConvertYUVToRGB32Row:\n"
++"PICConvertYUVToRGB32Row_SSE:\n"
  #endif
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x28(%esp),%edi\n"
    "mov    0x2c(%esp),%esi\n"
    "mov    0x30(%esp),%ebp\n"
    "mov    0x38(%esp),%ecx\n"
  
 -  "jmp    .Lconvertend\n"
+-
+-".Lconvertloop:"
 +  "jmp    1f\n"
- 
--".Lconvertloop:"
++
 +"0:"
    "movzbl (%edi),%eax\n"
    "add    $0x1,%edi\n"
    "movzbl (%esi),%ebx\n"
    "add    $0x1,%esi\n"
    "movq   2048(%ecx,%eax,8),%mm0\n"
    "movzbl (%edx),%eax\n"
    "paddsw 4096(%ecx,%ebx,8),%mm0\n"
    "movzbl 0x1(%edx),%ebx\n"
-@@ -546,72 +554,75 @@ extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
+@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
    "movq   0(%ecx,%ebx,8),%mm2\n"
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
@@ -2024,65 +2426,73 @@ index a66fa7b..382c2bd 100644
 +  ".previous\n"
 +#endif
  );
  
  void FastConvertYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
-                               int width) {
-   PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
-                           &kCoefficientsRgbY[0][0]);
- }
- 
+-                              int width) {
+-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+-                          &kCoefficientsRgbY[0][0]);
+-}
+-
 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
--                               const uint8* u_buf,
--                               const uint8* v_buf,
--                               uint8* rgb_buf,
--                               int width,
--                               int source_dx,
--                               int16 *kCoefficientsRgbY);
-+void PICScaleYUVToRGB32Row(const uint8* y_buf,
-+                           const uint8* u_buf,
-+                           const uint8* v_buf,
-+                           uint8* rgb_buf,
-+                           int width,
-+                           int source_dx,
-+                           int16 *kCoefficientsRgbY);
++                              int width)
++{
++  if (mozilla::supports_sse()) {
++    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++                                &kCoefficientsRgbY[0][0]);
++    return;
++  }
++
++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int source_dx,
+                                int16 *kCoefficientsRgbY);
  
    asm(
    ".text\n"
 -#if defined(OS_MACOSX)
+-"_PICScaleYUVToRGB32Row:\n"
 +#if defined(XP_MACOSX)
- "_PICScaleYUVToRGB32Row:\n"
++"_PICScaleYUVToRGB32Row_SSE:\n"
  #else
- "PICScaleYUVToRGB32Row:\n"
+-"PICScaleYUVToRGB32Row:\n"
++"PICScaleYUVToRGB32Row_SSE:\n"
  #endif
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x28(%esp),%edi\n"
    "mov    0x2c(%esp),%esi\n"
    "mov    0x30(%esp),%ebp\n"
    "mov    0x3c(%esp),%ecx\n"
    "xor    %ebx,%ebx\n"
 -  "jmp    Lscaleend\n"
+-
+-"Lscaleloop:"
 +  "jmp    1f\n"
- 
--"Lscaleloop:"
++
 +"0:"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%edi,%eax,1),%eax\n"
    "movq   2048(%ecx,%eax,8),%mm0\n"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%esi,%eax,1),%eax\n"
    "paddsw 4096(%ecx,%eax,8),%mm0\n"
-@@ -627,22 +638,22 @@ extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 
    "movq   0(%ecx,%eax,8),%mm2\n"
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
@@ -2098,17 +2508,17 @@ index a66fa7b..382c2bd 100644
  
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%edi,%eax,1),%eax\n"
    "movq   2048(%ecx,%eax,8),%mm0\n"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
    "movzbl (%esi,%eax,1),%eax\n"
-@@ -651,22 +662,24 @@ extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 
    "sar    $0x10,%eax\n"
    "movzbl (%edx,%eax,1),%eax\n"
    "movq   0(%ecx,%eax,8),%mm1\n"
    "paddsw %mm0,%mm1\n"
    "psraw  $0x6,%mm1\n"
    "packuswb %mm1,%mm1\n"
    "movd   %mm1,0x0(%ebp)\n"
  
@@ -2122,67 +2532,90 @@ index a66fa7b..382c2bd 100644
  );
  
 -
  void ScaleYUVToRGB32Row(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width,
-                         int source_dx) {
-   PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                         &kCoefficientsRgbY[0][0]);
-@@ -674,19 +687,20 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
- 
- void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width,
-                                  int source_dx,
-                                  int16 *kCoefficientsRgbY);
+-                        int source_dx) {
+-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+-                        &kCoefficientsRgbY[0][0]);
+-}
+-
+-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+-                                 const uint8* u_buf,
+-                                 const uint8* v_buf,
+-                                 uint8* rgb_buf,
+-                                 int width,
+-                                 int source_dx,
+-                                 int16 *kCoefficientsRgbY);
++                        int source_dx)
++{
++  if (mozilla::supports_sse()) {
++    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
++                              &kCoefficientsRgbY[0][0]);
++    return;
++  }
++
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                                     const uint8* u_buf,
++                                     const uint8* v_buf,
++                                     uint8* rgb_buf,
++                                     int width,
++                                     int source_dx,
++                                     int16 *kCoefficientsRgbY);
 +
    asm(
    ".text\n"
 -#if defined(OS_MACOSX)
+-"_PICLinearScaleYUVToRGB32Row:\n"
 +#if defined(XP_MACOSX)
- "_PICLinearScaleYUVToRGB32Row:\n"
++"_PICLinearScaleYUVToRGB32Row_SSE:\n"
  #else
- "PICLinearScaleYUVToRGB32Row:\n"
+-"PICLinearScaleYUVToRGB32Row:\n"
++"PICLinearScaleYUVToRGB32Row_SSE:\n"
  #endif
    "pusha\n"
    "mov    0x24(%esp),%edx\n"
    "mov    0x30(%esp),%ebp\n"
    "mov    0x34(%esp),%ecx\n"
-@@ -696,21 +710,21 @@ void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+   "mov    0x3c(%esp),%edi\n"
+   "xor    %ebx,%ebx\n"
+ 
    // source_width = width * source_dx + ebx
    "mov    0x34(%esp), %ecx\n"
    "imull  0x38(%esp), %ecx\n"
    "mov    %ecx, 0x34(%esp)\n"
  
    "mov    0x38(%esp), %ecx\n"
    "xor    %ebx,%ebx\n"     // x = 0
    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
 -  "jl     .lscaleend\n"
 +  "jl     1f\n"
    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
 -  "jmp    .lscaleend\n"
+-
+-".lscaleloop:"
 +  "jmp    1f\n"
- 
--".lscaleloop:"
++
 +"0:"
    "mov    0x28(%esp),%esi\n"
    "mov    %ebx,%eax\n"
    "sar    $0x11,%eax\n"
  
    "movzbl (%esi,%eax,1),%ecx\n"
    "movzbl 1(%esi,%eax,1),%esi\n"
    "mov    %ebx,%eax\n"
    "andl   $0x1fffe, %eax \n"
-@@ -746,17 +760,17 @@ void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
    "imul   %eax, %esi \n"
    "xorl   $0xffff, %eax \n"
    "imul   %eax, %ecx \n"
    "addl   %esi, %ecx \n"
    "shrl   $16, %ecx \n"
    "movq   (%edi,%ecx,8),%mm1\n"
  
    "cmp    0x34(%esp), %ebx\n"
@@ -2191,17 +2624,17 @@ index a66fa7b..382c2bd 100644
  
    "mov    %ebx,%eax\n"
    "sar    $0x10,%eax\n"
    "movzbl (%edx,%eax,1),%ecx\n"
    "movzbl 1(%edx,%eax,1),%esi\n"
    "mov    %ebx,%eax\n"
    "add    0x38(%esp),%ebx\n"
    "andl   $0xffff, %eax \n"
-@@ -770,154 +784,66 @@ void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
    "paddsw %mm0,%mm1\n"
    "paddsw %mm0,%mm2\n"
    "psraw  $0x6,%mm1\n"
    "psraw  $0x6,%mm2\n"
    "packuswb %mm2,%mm1\n"
    "movntq %mm1,0x0(%ebp)\n"
    "add    $0x8,%ebp\n"
  
@@ -2228,24 +2661,19 @@ index a66fa7b..382c2bd 100644
  
 +
  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 -                        const uint8* u_buf,
 -                        const uint8* v_buf,
 -                        uint8* rgb_buf,
 -                        int width,
 -                        int source_dx) {
-+                              const uint8* u_buf,
-+                              const uint8* v_buf,
-+                              uint8* rgb_buf,
-+                              int width,
-+                              int source_dx) {
-   PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                               &kCoefficientsRgbY[0][0]);
- }
+-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+-                              &kCoefficientsRgbY[0][0]);
+-}
 -
 -#else  // USE_MMX
 -
 -// C reference code that mimic the YUV assembly.
 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
 -
@@ -2275,16 +2703,30 @@ index a66fa7b..382c2bd 100644
 -  a >>= 6;
 -
 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
 -                                        (packuswb(g) << 8) |
 -                                        (packuswb(r) << 16) |
 -                                        (packuswb(a) << 24);
 -}
 -
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int source_dx)
++{
++  if (mozilla::supports_sse()) {
++    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++                                    source_dx, &kCoefficientsRgbY[0][0]);
++    return;
++  }
++
++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
 +#else
  void FastConvertYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
 -  for (int x = 0; x < width; x += 2) {
 -    uint8 u = u_buf[x >> 1];
@@ -2292,23 +2734,25 @@ index a66fa7b..382c2bd 100644
 -    uint8 y0 = y_buf[x];
 -    YuvPixel(y0, u, v, rgb_buf);
 -    if ((x + 1) < width) {
 -      uint8 y1 = y_buf[x + 1];
 -      YuvPixel(y1, u, v, rgb_buf + 4);
 -    }
 -    rgb_buf += 8;  // Advance 2 pixels.
 -  }
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
- }
- 
+-}
+-
 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
 -// A shift by 17 is used to further subsample the chrominence channels.
 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
 -// for 1/65536 pixel accurate interpolation.
++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
  void ScaleYUVToRGB32Row(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width,
                          int source_dx) {
 -  int x = 0;
 -  for (int i = 0; i < width; i += 2) {
@@ -2319,18 +2763,19 @@ index a66fa7b..382c2bd 100644
 -    x += source_dx;
 -    if ((i + 1) < width) {
 -      y = y_buf[x >> 16];
 -      YuvPixel(y, u, v, rgb_buf+4);
 -      x += source_dx;
 -    }
 -    rgb_buf += 8;
 -  }
+-}
 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
- }
++}
  
  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width,
                                int source_dx) {
 -  int x = 0;
@@ -2356,74 +2801,235 @@ index a66fa7b..382c2bd 100644
 -      y1 = y_buf[(x >> 16) + 1];
 -      y_frac = (x & 65535);
 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
 -      YuvPixel(y, u, v, rgb_buf+4);
 -      x += source_dx;
 -    }
 -    rgb_buf += 8;
 -  }
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
- }
-+#endif
- 
+-}
+-
 -#endif  // USE_MMX
- }  // extern "C"
- 
-diff --git b/gfx/ycbcr/yuv_row_table.cpp a/gfx/ycbcr/yuv_row_table.cpp
-index 296380b..ad71341 100644
---- b/gfx/ycbcr/yuv_row_table.cpp
-+++ a/gfx/ycbcr/yuv_row_table.cpp
+-}  // extern "C"
+-
++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++#endif
++
++}
+diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
+--- a/gfx/ycbcr/yuv_row_table.cpp
++++ b/gfx/ycbcr/yuv_row_table.cpp
 @@ -1,13 +1,13 @@
  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style license that can be
  // found in the LICENSE file.
  
 -#include "media/base/yuv_row.h"
 +#include "yuv_row.h"
  
  extern "C" {
  
  #define RGBY(i) { \
    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
    0 \
-diff --git b/gfx/ycbcr/yuv_row_win.cpp a/gfx/ycbcr/yuv_row_win.cpp
-index b5049a5..627b8cb 100644
---- b/gfx/ycbcr/yuv_row_win.cpp
-+++ a/gfx/ycbcr/yuv_row_win.cpp
-@@ -1,20 +1,23 @@
+diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
+--- a/gfx/ycbcr/yuv_row_win.cpp
++++ b/gfx/ycbcr/yuv_row_win.cpp
+@@ -1,26 +1,27 @@
  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style license that can be
  // found in the LICENSE file.
  
 -#include "media/base/yuv_row.h"
 +#include "yuv_row.h"
-+#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
-+#define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
 +#include "mozilla/SSE.h"
-+
  
  #define kCoefficientsRgbU kCoefficientsRgbY + 2048
  #define kCoefficientsRgbV kCoefficientsRgbY + 4096
  
  extern "C" {
--
+ 
 -#if USE_MMX
-+#if defined(MOZILLA_COMPILE_WITH_SSE2) && defined(_M_IX86)
+-__declspec(naked)
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width) {
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++__declspec(naked)
++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int width) {
+   __asm {
+     pushad
+     mov       edx, [esp + 32 + 4]   // Y
+     mov       edi, [esp + 32 + 8]   // U
+     mov       esi, [esp + 32 + 12]  // V
+     mov       ebp, [esp + 32 + 16]  // rgb
+     mov       ecx, [esp + 32 + 20]  // width
+     jmp       convertend
+@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
+  convertdone :
+ 
+     popad
+     ret
+   }
+ }
+ 
  __declspec(naked)
- void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width) {
+-void ConvertYUVToRGB32Row(const uint8* y_buf,
+-                          const uint8* u_buf,
+-                          const uint8* v_buf,
+-                          uint8* rgb_buf,
+-                          int width,
+-                          int step) {
++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int step) {
+   __asm {
+     pushad
+     mov       edx, [esp + 32 + 4]   // Y
+     mov       edi, [esp + 32 + 8]   // U
+     mov       esi, [esp + 32 + 12]  // V
+     mov       ebp, [esp + 32 + 16]  // rgb
+     mov       ecx, [esp + 32 + 20]  // width
+     mov       ebx, [esp + 32 + 24]  // step
+@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
+  wdone :
+ 
+     popad
+     ret
+   }
+ }
+ 
+ __declspec(naked)
+-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+-                                const uint8* u_buf,
+-                                const uint8* v_buf,
+-                                uint8* rgb_buf,
+-                                int width,
+-                                int ystep,
+-                                int uvstep) {
++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                    const uint8* u_buf,
++                                    const uint8* v_buf,
++                                    uint8* rgb_buf,
++                                    int width,
++                                    int ystep,
++                                    int uvstep) {
    __asm {
      pushad
-@@ -438,152 +441,37 @@ lscalelastpixel:
+     mov       edx, [esp + 32 + 4]   // Y
+     mov       edi, [esp + 32 + 8]   // U
+     mov       esi, [esp + 32 + 12]  // V
+     mov       ebp, [esp + 32 + 16]  // rgb
+     mov       ecx, [esp + 32 + 20]  // width
+     jmp       wend
+@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
+  wdone :
+ 
+     popad
+     ret
+   }
+ }
+ 
+ __declspec(naked)
+-void DoubleYUVToRGB32Row(const uint8* y_buf,
+-                         const uint8* u_buf,
+-                         const uint8* v_buf,
+-                         uint8* rgb_buf,
+-                         int width) {
++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
++                             const uint8* u_buf,
++                             const uint8* v_buf,
++                             uint8* rgb_buf,
++                             int width) {
+   __asm {
+     pushad
+     mov       edx, [esp + 32 + 4]   // Y
+     mov       edi, [esp + 32 + 8]   // U
+     mov       esi, [esp + 32 + 12]  // V
+     mov       ebp, [esp + 32 + 16]  // rgb
+     mov       ecx, [esp + 32 + 20]  // width
+     jmp       wend
+@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
+     jns       wloop1
+  wdone :
+     popad
+     ret
+   }
+ }
+ 
+ // This version does general purpose scaling by any amount, up or down.
+-// The only thing it can not do it rotation by 90 or 270.
+-// For performance the chroma is under sampled, reducing cost of a 3x
++// The only thing it cannot do is rotation by 90 or 270.
++// For performance the chroma is under-sampled, reducing cost of a 3x
+ // 1080p scale from 8.4 ms to 5.4 ms.
+ __declspec(naked)
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+-                        const uint8* u_buf,
+-                        const uint8* v_buf,
+-                        uint8* rgb_buf,
+-                        int width,
+-                        int source_dx) {
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                            const uint8* u_buf,
++                            const uint8* v_buf,
++                            uint8* rgb_buf,
++                            int width,
++                            int source_dx) {
+   __asm {
+     pushad
+     mov       edx, [esp + 32 + 4]   // Y
+     mov       edi, [esp + 32 + 8]   // U
+     mov       esi, [esp + 32 + 12]  // V
+     mov       ebp, [esp + 32 + 16]  // rgb
+     mov       ecx, [esp + 32 + 20]  // width
+     xor       ebx, ebx              // x
+@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+ 
+  scaledone :
+     popad
+     ret
+   }
+ }
+ 
+ __declspec(naked)
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width,
+-                              int source_dx) {
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int width,
++                                  int source_dx) {
+   __asm {
+     pushad
+     mov       edx, [esp + 32 + 4]  // Y
+     mov       edi, [esp + 32 + 8]  // U
+                 // [esp + 32 + 12] // V
+     mov       ebp, [esp + 32 + 16] // rgb
+     mov       ecx, [esp + 32 + 20] // width
+     imul      ecx, [esp + 32 + 24] // source_dx
+@@ -438,152 +439,60 @@ lscalelastpixel:
      paddsw    mm1, mm0
      psraw     mm1, 6
      packuswb  mm1, mm1
      movd      [ebp], mm1
      popad
      ret
    };
  }
@@ -2482,41 +3088,50 @@ index b5049a5..627b8cb 100644
 -    psraw     mm1, 6
 -    packuswb  mm1, mm1
 -    mov       eax, rgb_buf
 -    movd      [eax], mm1
 -    emms
 -  }
 -}
 -#endif
--
-+#else // MOZILLA_COMPILE_WITH_SSE2
++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+ 
  void FastConvertYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
 -  for (int x = 0; x < width; x += 2) {
 -    uint8 u = u_buf[x >> 1];
 -    uint8 v = v_buf[x >> 1];
 -    uint8 y0 = y_buf[x];
 -    YuvPixel(y0, u, v, rgb_buf);
 -    if ((x + 1) < width) {
 -      uint8 y1 = y_buf[x + 1];
 -      YuvPixel(y1, u, v, rgb_buf + 4);
 -    }
 -    rgb_buf += 8;  // Advance 2 pixels.
 -  }
-+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
- }
- 
+-}
+-
 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
 -// A shift by 17 is used to further subsample the chrominence channels.
 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
 -// for 1/65536 pixel accurate interpolation.
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++  if (mozilla::supports_sse()) {
++    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
++    return;
++  }
++#endif  // Same guard as the one around the _SSE definitions above.
++
++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
  void ScaleYUVToRGB32Row(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width,
                          int source_dx) {
 -  int x = 0;
 -  for (int i = 0; i < width; i += 2) {
@@ -2527,18 +3142,27 @@ index b5049a5..627b8cb 100644
 -    x += source_dx;
 -    if ((i + 1) < width) {
 -      y = y_buf[x >> 16];
 -      YuvPixel(y, u, v, rgb_buf+4);
 -      x += source_dx;
 -    }
 -    rgb_buf += 8;
 -  }
+-}
++
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++  if (mozilla::supports_sse()) {
++    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++    return;
++  }
++#endif
++
 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
- }
++}
  
  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width,
                                int source_dx) {
 -  int x = 0;
@@ -2564,15 +3188,25 @@ index b5049a5..627b8cb 100644
 -      y1 = y_buf[(x >> 16) + 1];
 -      y_frac = (x & 65535);
 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
 -      YuvPixel(y, u, v, rgb_buf+4);
 -      x += source_dx;
 -    }
 -    rgb_buf += 8;
 -  }
-+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
- }
+-}
 -
 -#endif  // USE_MMX
+-}  // extern "C"
+-
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++  if (mozilla::supports_sse()) {
++    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++                                 source_dx);
++    return;
++  }
 +#endif
- }  // extern "C"
- 
++
++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++} // extern "C"
--- a/gfx/ycbcr/update.sh
+++ b/gfx/ycbcr/update.sh
@@ -2,9 +2,8 @@
 cp $1/media/base/yuv_convert.h .
 cp $1/media/base/yuv_convert.cc yuv_convert.cpp
 cp $1/media/base/yuv_row.h .
 cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp
 cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp
 patch -p3 <convert.patch
-patch -p3 <arm.patch