Bug 619178 - Update gfx/ycbcr to the new SSE.h interface. r=joedrew, a=blocking
author: Justin Lebar <justin.lebar@gmail.com>
Thu, 09 Dec 2010 09:59:21 -0800
changeset 60427 41d1931bca0b2d72cf0c92daaf017ee34eaacc6c
parent 60426 ae853abfbc86a9c94df2fbaadb92fad031fecea1
child 60428 d580ec700a1190dd833b059874c4f9b3374ac064
push id: 17985
push user: jlebar@mozilla.com
push date: Thu, 13 Jan 2011 03:11:52 +0000
treeherder: mozilla-central@ab3e03c79004 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: joedrew, blocking
bugs: 619178
milestone: 2.0b10pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 619178 - Update gfx/ycbcr to the new SSE.h interface. r=joedrew, a=blocking
gfx/ycbcr/yuv_convert.cpp
gfx/ycbcr/yuv_row.h
gfx/ycbcr/yuv_row_posix.cpp
gfx/ycbcr/yuv_row_win.cpp
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -362,45 +362,49 @@ NS_GFX_(void) ScaleYCbCrToRGB32(const ui
       u_ptr = ubuf;
       v_ptr = vbuf;
       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
     }
     if (source_dx == kFractionMax) {  // Not scaled
       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                dest_pixel, width);
-    } else {
-      if (filter & FILTER_BILINEAR_H) {
+    } else if (filter & FILTER_BILINEAR_H) {
         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                  dest_pixel, width, source_dx);
     } else {
 // Specialized scalers and rotation.
-#if defined(_MSC_VER) && defined(_M_IX86)
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
+      if(mozilla::supports_sse()) {
         if (width == (source_width * 2)) {
-          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                              dest_pixel, width);
+          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                  dest_pixel, width);
         } else if ((source_dx & kFractionMask) == 0) {
           // Scaling by integer scale factor. ie half.
-          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                               dest_pixel, width,
-                               source_dx >> kFractionBits);
+          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, width,
+                                   source_dx >> kFractionBits);
         } else if (source_dx_uv == source_dx) {  // Not rotated.
           ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                              dest_pixel, width, source_dx);
         } else {
-          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                                     dest_pixel, width,
-                                     source_dx >> kFractionBits,
-                                     source_dx_uv >> kFractionBits);
+          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                         dest_pixel, width,
+                                         source_dx >> kFractionBits,
+                                         source_dx_uv >> kFractionBits);
         }
+      }
+      else {
+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, width, source_dx);
+      }
 #else
-        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
-                           dest_pixel, width, source_dx);
+      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                         dest_pixel, width, source_dx);
 #endif
-      }      
     }
   }
   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
   if (has_mmx)
     EMMS();
 }
 
 }  // namespace gfx
--- a/gfx/ycbcr/yuv_row.h
+++ b/gfx/ycbcr/yuv_row.h
@@ -32,41 +32,44 @@ void FastConvertYUVToRGB32Row(const uint
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width);
 
 // Can do 1x, half size or any scale down by an integer amount.
 // Step can be negative (mirroring, rotate 180).
 // This is the third fastest of the scalers.
-void ConvertYUVToRGB32Row(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width,
-                          int step);
+// Only defined on Windows x86-32.
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step);
 
 // Rotate is like Convert, but applies different step to Y versus U and V.
 // This allows rotation by 90 or 270, by stepping by stride.
 // This is the forth fastest of the scalers.
-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* rgb_buf,
-                                int width,
-                                int ystep,
-                                int uvstep);
+// Only defined on Windows x86-32.
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep);
 
 // Doubler does 4 pixels at a time.  Each pixel is replicated.
 // This is the fastest of the scalers.
-void DoubleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width);
+// Only defined on Windows x86-32.
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
 
 // Handles arbitrary scaling up or down.
 // Mirroring is supported, but not 90 or 270 degree rotation.
 // Chroma is under sampled every 2 pixels for performance.
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
--- a/gfx/ycbcr/yuv_row_posix.cpp
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -1,20 +1,23 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #include "yuv_row.h"
+#include "mozilla/SSE.h"
 
 #define DCHECK(a)
 
 extern "C" {
 
 #if defined(ARCH_CPU_X86_64)
 
+// We don't need CPUID guards here, since x86-64 implies SSE2.
+
 // AMD64 ABI uses register paremters.
 void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                               const uint8* u_buf,  // rsi
                               const uint8* v_buf,  // rdx
                               uint8* rgb_buf,      // rcx
                               int width) {         // r8
   asm(
   "jmp    1f\n"
@@ -250,30 +253,30 @@ void LinearScaleYUVToRGB32Row(const uint
     "r"(rgb_buf),  // %3
     "r"(width),  // %4
     "r" (kCoefficientsRgbY),  // %5
     "r"(static_cast<long>(source_dx))  // %6
   : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
 );
 }
 
-#elif defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
 
 // PIC version is slower because less registers are available, so
 // non-PIC is used on platforms where it is possible.
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
   asm(
   ".text\n"
-  ".global FastConvertYUVToRGB32Row\n"
-  ".type FastConvertYUVToRGB32Row, @function\n"
-"FastConvertYUVToRGB32Row:\n"
+  ".global FastConvertYUVToRGB32Row_SSE\n"
+  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
+"FastConvertYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
   "jmp    1f\n"
 
@@ -316,28 +319,42 @@ void FastConvertYUVToRGB32Row(const uint
 "2:"
   "popa\n"
   "ret\n"
 #if !defined(XP_MACOSX)
   ".previous\n"
 #endif
 );
 
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width)
+{
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
 
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx);
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx);
   asm(
   ".text\n"
-  ".global ScaleYUVToRGB32Row\n"
-  ".type ScaleYUVToRGB32Row, @function\n"
-"ScaleYUVToRGB32Row:\n"
+  ".global ScaleYUVToRGB32Row_SSE\n"
+  ".type ScaleYUVToRGB32Row_SSE, @function\n"
+"ScaleYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
   "xor    %ebx,%ebx\n"
   "jmp    1f\n"
@@ -395,27 +412,43 @@ void ScaleYUVToRGB32Row(const uint8* y_b
 "2:"
   "popa\n"
   "ret\n"
 #if !defined(XP_MACOSX)
   ".previous\n"
 #endif
 );
 
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx);
+// Dispatcher: use the SSE row scaler when the CPU has SSE, else the C one.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx)
+{
+  if (mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                           width, source_dx);
+    return;  // Row is done; without this the C path would redo it.
+  }
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx);
   asm(
   ".text\n"
-  ".global LinearScaleYUVToRGB32Row\n"
-  ".type LinearScaleYUVToRGB32Row, @function\n"
-"LinearScaleYUVToRGB32Row:\n"
+  ".global LinearScaleYUVToRGB32Row_SSE\n"
+  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
+"LinearScaleYUVToRGB32Row_SSE:\n"
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x30(%esp),%ebp\n"
 
   // source_width = width * source_dx + ebx
   "mov    0x34(%esp), %ecx\n"
   "imull  0x38(%esp), %ecx\n"
@@ -510,31 +543,47 @@ void LinearScaleYUVToRGB32Row(const uint
   "movd %mm1, (%ebp)\n"
   "popa\n"
   "ret\n"
 #if !defined(XP_MACOSX)
   ".previous\n"
 #endif
 );
 
-#elif defined(ARCH_CPU_X86_32) && defined(__PIC__)
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx)
+{
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                                 width, source_dx);
+    return;  // Row is done; without this the C path would redo it.
+  }
 
-void PICConvertYUVToRGB32Row(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
-                             int width,
-                             int16 *kCoefficientsRgbY);
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+
+void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 int16 *kCoefficientsRgbY);
 
   asm(
   ".text\n"
 #if defined(XP_MACOSX)
-"_PICConvertYUVToRGB32Row:\n"
+"_PICConvertYUVToRGB32Row_SSE:\n"
 #else
-"PICConvertYUVToRGB32Row:\n"
+"PICConvertYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x38(%esp),%ecx\n"
 
@@ -583,35 +632,41 @@ void PICConvertYUVToRGB32Row(const uint8
   ".previous\n"
 #endif
 );
 
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
-                              int width) {
-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
-                          &kCoefficientsRgbY[0][0]);
+                              int width)
+{
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                &kCoefficientsRgbY[0][0]);  // Table passed in.
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
+
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
 
-void PICScaleYUVToRGB32Row(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb_buf,
-                           int width,
-                           int source_dx,
-                           int16 *kCoefficientsRgbY);
+void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width,
+                               int source_dx,
+                               int16 *kCoefficientsRgbY);
 
   asm(
   ".text\n"
 #if defined(XP_MACOSX)
-"_PICScaleYUVToRGB32Row:\n"
+"_PICScaleYUVToRGB32Row_SSE:\n"
 #else
-"PICScaleYUVToRGB32Row:\n"
+"PICScaleYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x28(%esp),%edi\n"
   "mov    0x2c(%esp),%esi\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x3c(%esp),%ecx\n"
   "xor    %ebx,%ebx\n"
@@ -675,35 +730,41 @@ void PICScaleYUVToRGB32Row(const uint8* 
 #endif
 );
 
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
-                        int source_dx) {
-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                        &kCoefficientsRgbY[0][0]);
+                        int source_dx)
+{
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+                              &kCoefficientsRgbY[0][0]);  // Table passed in.
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
+
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 }
 
-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width,
-                                 int source_dx,
-                                 int16 *kCoefficientsRgbY);
+void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     uint8* rgb_buf,
+                                     int width,
+                                     int source_dx,
+                                     int16 *kCoefficientsRgbY);
 
   asm(
   ".text\n"
 #if defined(XP_MACOSX)
-"_PICLinearScaleYUVToRGB32Row:\n"
+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
 #else
-"PICLinearScaleYUVToRGB32Row:\n"
+"PICLinearScaleYUVToRGB32Row_SSE:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
   "mov    0x30(%esp),%ebp\n"
   "mov    0x34(%esp),%ecx\n"
   "mov    0x3c(%esp),%edi\n"
   "xor    %ebx,%ebx\n"
 
@@ -808,19 +869,25 @@ void PICLinearScaleYUVToRGB32Row(const u
 );
 
 
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
-                              int source_dx) {
-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
-                              &kCoefficientsRgbY[0][0]);
+                              int source_dx)
+{
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                    source_dx, &kCoefficientsRgbY[0][0]);
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
+
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 }
 #else
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
@@ -840,10 +907,9 @@ void LinearScaleYUVToRGB32Row(const uint
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 }
 #endif
 
-}  // extern "C"
-
+}
--- a/gfx/ycbcr/yuv_row_win.cpp
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -1,29 +1,27 @@
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #include "yuv_row.h"
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
 #include "mozilla/SSE.h"
 
-
 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
 
 extern "C" {
-#if defined(MOZILLA_COMPILE_WITH_SSE2) && defined(_M_IX86)
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 __declspec(naked)
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       convertend
@@ -67,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
  convertdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void ConvertYUVToRGB32Row(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width,
-                          int step) {
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     mov       ebx, [esp + 32 + 24]  // step
@@ -128,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
  wdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* rgb_buf,
-                                int width,
-                                int ystep,
-                                int uvstep) {
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       wend
@@ -191,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
  wdone :
 
     popad
     ret
   }
 }
 
 __declspec(naked)
-void DoubleYUVToRGB32Row(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width) {
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     jmp       wend
@@ -259,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
     jns       wloop1
  wdone :
     popad
     ret
   }
 }
 
 // This version does general purpose scaling by any amount, up or down.
-// The only thing it can not do it rotation by 90 or 270.
-// For performance the chroma is under sampled, reducing cost of a 3x
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
 // 1080p scale from 8.4 ms to 5.4 ms.
 __declspec(naked)
-void ScaleYUVToRGB32Row(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width,
-                        int source_dx) {
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]   // Y
     mov       edi, [esp + 32 + 8]   // U
     mov       esi, [esp + 32 + 12]  // V
     mov       ebp, [esp + 32 + 16]  // rgb
     mov       ecx, [esp + 32 + 20]  // width
     xor       ebx, ebx              // x
@@ -336,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
 
  scaledone :
     popad
     ret
   }
 }
 
 __declspec(naked)
-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width,
-                              int source_dx) {
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx) {
   __asm {
     pushad
     mov       edx, [esp + 32 + 4]  // Y
     mov       edi, [esp + 32 + 8]  // U
                 // [esp + 32 + 12] // V
     mov       ebp, [esp + 32 + 16] // rgb
     mov       ecx, [esp + 32 + 20] // width
     imul      ecx, [esp + 32 + 24] // source_dx
@@ -441,37 +439,60 @@ lscalelastpixel:
     paddsw    mm1, mm0
     psraw     mm1, 6
     packuswb  mm1, mm1
     movd      [ebp], mm1
     popad
     ret
   };
 }
-#else // MOZILLA_COMPILE_WITH_SSE2
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE
+  if (mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;
+  }
+#endif
+
   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 }
 
 void ScaleYUVToRGB32Row(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width,
                         int source_dx) {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
+#endif  // Same guard as the _SSE definitions in this file.
+
   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 }
 
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {  // Runtime CPU feature check.
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                 source_dx);
+    return;  // SSE routine handled this row; skip the C fallback.
+  }
+#endif  // Same guard as the _SSE definitions in this file.
+
   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 }
-#endif
-}  // extern "C"
 
+} // extern "C"