Bug 616778 - Part 1: Move vectorized code in FilterRows inside CPUID guards. r=joedrew, a=blocking
authorJustin Lebar <justin.lebar@gmail.com>
Thu, 09 Dec 2010 09:59:21 -0800
changeset 60428 d580ec700a1190dd833b059874c4f9b3374ac064
parent 60427 41d1931bca0b2d72cf0c92daaf017ee34eaacc6c
child 60429 ab3e03c79004e2925c2868a412e525ecdd2f342e
push id1
push userroot
push dateMon, 20 Oct 2014 17:29:22 +0000
reviewersjoedrew, blocking
bugs616778
milestone2.0b10pre
Bug 616778 - Part 1: Move vectorized code in FilterRows inside CPUID guards. r=joedrew, a=blocking
gfx/ycbcr/Makefile.in
gfx/ycbcr/yuv_convert.cpp
gfx/ycbcr/yuv_convert_mmx.cpp
gfx/ycbcr/yuv_convert_sse2.cpp
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -17,16 +17,36 @@ EXPORTS      = chromium_types.h \
                yuv_row.h \
                $(NULL)
 
 CPPSRCS = yuv_convert.cpp \
           yuv_row_c.cpp \
           yuv_row_table.cpp \
           $(NULL)
 
+# Are we targeting x86 or x64?  If so, build yuv_convert_mmx.cpp and
+# yuv_convert_sse2.cpp. These files use MMX and SSE2 intrinsics, so they need
+# special compile flags on some compilers.
+ifneq (,$(INTEL_ARCHITECTURE))
+CPPSRCS += yuv_convert_mmx.cpp \
+           yuv_convert_sse2.cpp
+
+ifdef GNU_CC
+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -mmmx
+yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -msse2
+endif
+
+ifdef SOLARIS_SUNPRO_CXX
+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=mmx -xO4
+yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse2 -xO4
+endif
+
+endif
+
+
 ifdef _MSC_VER
 CPPSRCS += yuv_row_win.cpp \
            $(NULL)
 else
 ifeq ($(OS_ARCH),Linux)
 CPPSRCS += yuv_row_posix.cpp \
            $(NULL)
 else
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -15,18 +15,16 @@
 //
 // ARGB pixel format is output, which on little endian is stored as BGRA.
 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
 
 #include "yuv_convert.h"
 
 // Header for low level row functions.
 #include "yuv_row.h"
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
 #include "mozilla/SSE.h"
 
 #ifdef HAVE_YCBCR_TO_RGB565
 void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
 #endif
 
 namespace mozilla {
 
@@ -122,87 +120,19 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const 
     }
   }
 
   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
   if (has_sse)
     EMMS();
 }
 
-#if defined(MOZILLA_COMPILE_WITH_SSE2)
-// FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version does 16 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
-  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
-
-  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
-  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
-  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
-  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
-
-  do {
-    __m128i y0 = _mm_loadu_si128(y0_ptr128);
-    __m128i y1 = _mm_loadu_si128(y1_ptr128);
-    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
-    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
-    y0 = _mm_unpacklo_epi8(y0, zero);
-    y1 = _mm_unpacklo_epi8(y1, zero);
-    y0 = _mm_mullo_epi16(y0, y0_fraction);
-    y1 = _mm_mullo_epi16(y1, y1_fraction);
-    y2 = _mm_mullo_epi16(y2, y0_fraction);
-    y3 = _mm_mullo_epi16(y3, y1_fraction);
-    y0 = _mm_add_epi16(y0, y1);
-    y2 = _mm_add_epi16(y2, y3);
-    y0 = _mm_srli_epi16(y0, 8);
-    y2 = _mm_srli_epi16(y2, 8);
-    y0 = _mm_packus_epi16(y0, y2);
-    *dest128++ = y0;
-    ++y0_ptr128;
-    ++y1_ptr128;
-  } while (dest128 < end128);
-}
-#elif defined(MOZILLA_COMPILE_WITH_MMX)
-// MMX version does 8 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
-  __m64 zero = _mm_setzero_si64();
-  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
-  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
-
-  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
-  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
-  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
-  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
-
-  do {
-    __m64 y0 = *y0_ptr64++;
-    __m64 y1 = *y1_ptr64++;
-    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
-    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
-    y0 = _mm_unpacklo_pi8(y0, zero);
-    y1 = _mm_unpacklo_pi8(y1, zero);
-    y0 = _mm_mullo_pi16(y0, y0_fraction);
-    y1 = _mm_mullo_pi16(y1, y1_fraction);
-    y2 = _mm_mullo_pi16(y2, y0_fraction);
-    y3 = _mm_mullo_pi16(y3, y1_fraction);
-    y0 = _mm_add_pi16(y0, y1);
-    y2 = _mm_add_pi16(y2, y3);
-    y0 = _mm_srli_pi16(y0, 8);
-    y2 = _mm_srli_pi16(y2, 8);
-    y0 = _mm_packs_pu16(y0, y2);
-    *dest64++ = y0;
-  } while (dest64 < end64);
-}
-#else  // no MMX or SSE2
 // C version does 8 at a time to mimic MMX code
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
-                       int source_width, int source_y_fraction) {
+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                         int source_width, int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   uint8* end = ybuf + source_width;
   do {
     ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
     ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
     ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
     ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
@@ -210,18 +140,48 @@ static void FilterRows(uint8* ybuf, cons
     ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
     ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
     ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
     y0_ptr += 8;
     y1_ptr += 8;
     ybuf += 8;
   } while (ybuf < end);
 }
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction);
 #endif
 
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction);
+#endif
+
+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
+                              const uint8* y1_ptr, int source_width,
+                              int source_y_fraction) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+  if (mozilla::supports_sse2()) {
+    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+    return;
+  }
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+  if (mozilla::supports_mmx()) {
+    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+    return;
+  }
+#endif
+
+  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
+
+
 // Scale a frame of YUV to 32 bit ARGB.
 NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int source_width,
                                 int source_height,
                                 int width,
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// MMX version does 8 pixels at a time.
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction) {
+  __m64 zero = _mm_setzero_si64();
+  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+
+  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+  do {
+    __m64 y0 = *y0_ptr64++;
+    __m64 y1 = *y1_ptr64++;
+    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+    y0 = _mm_unpacklo_pi8(y0, zero);
+    y1 = _mm_unpacklo_pi8(y1, zero);
+    y0 = _mm_mullo_pi16(y0, y0_fraction);
+    y1 = _mm_mullo_pi16(y1, y1_fraction);
+    y2 = _mm_mullo_pi16(y2, y0_fraction);
+    y3 = _mm_mullo_pi16(y3, y1_fraction);
+    y0 = _mm_add_pi16(y0, y1);
+    y2 = _mm_add_pi16(y2, y3);
+    y0 = _mm_srli_pi16(y0, 8);
+    y2 = _mm_srli_pi16(y2, 8);
+    y0 = _mm_packs_pu16(y0, y2);
+    *dest64++ = y0;
+  } while (dest64 < end64);
+}
+
+}
+}
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// SSE2 version does 16 pixels at a time.
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+
+  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+  do {
+    __m128i y0 = _mm_loadu_si128(y0_ptr128);
+    __m128i y1 = _mm_loadu_si128(y1_ptr128);
+    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+    y0 = _mm_unpacklo_epi8(y0, zero);
+    y1 = _mm_unpacklo_epi8(y1, zero);
+    y0 = _mm_mullo_epi16(y0, y0_fraction);
+    y1 = _mm_mullo_epi16(y1, y1_fraction);
+    y2 = _mm_mullo_epi16(y2, y0_fraction);
+    y3 = _mm_mullo_epi16(y3, y1_fraction);
+    y0 = _mm_add_epi16(y0, y1);
+    y2 = _mm_add_epi16(y2, y3);
+    y0 = _mm_srli_epi16(y0, 8);
+    y2 = _mm_srli_epi16(y2, 8);
+    y0 = _mm_packus_epi16(y0, y2);
+    *dest128++ = y0;
+    ++y0_ptr128;
+    ++y1_ptr128;
+  } while (dest128 < end128);
+}
+
+}
+}