Bug 616778 - Part 1: Move vectorized code in FilterRows inside CPUID guards. r=joedrew, a=blocking
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -17,16 +17,36 @@ EXPORTS = chromium_types.h \
yuv_row.h \
$(NULL)
CPPSRCS = yuv_convert.cpp \
yuv_row_c.cpp \
yuv_row_table.cpp \
$(NULL)
+# Are we targeting x86 or x64? If so, build yuv_convert_mmx.cpp and
+# yuv_convert_sse2.cpp. These files use MMX and SSE2 intrinsics, so they need
+# special compile flags on some compilers.
+ifneq (,$(INTEL_ARCHITECTURE))
+CPPSRCS += yuv_convert_mmx.cpp \
+ yuv_convert_sse2.cpp
+
+ifdef GNU_CC
+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -mmmx
+yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -msse2
+endif
+
+ifdef SOLARIS_SUNPRO_CXX
+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=mmx -xO4
+yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse2 -xO4
+endif
+
+endif
+
+
ifdef _MSC_VER
CPPSRCS += yuv_row_win.cpp \
$(NULL)
else
ifeq ($(OS_ARCH),Linux)
CPPSRCS += yuv_row_posix.cpp \
$(NULL)
else
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -15,18 +15,16 @@
//
// ARGB pixel format is output, which on little endian is stored as BGRA.
// The alpha is set to 255, allowing the application to use RGBA or RGB32.
#include "yuv_convert.h"
// Header for low level row functions.
#include "yuv_row.h"
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
-#define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
#include "mozilla/SSE.h"
#ifdef HAVE_YCBCR_TO_RGB565
void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
#endif
namespace mozilla {
@@ -122,87 +120,19 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const
}
}
// MMX used for FastConvertYUVToRGB32Row requires emms instruction.
if (has_sse)
EMMS();
}
-#if defined(MOZILLA_COMPILE_WITH_SSE2)
-// FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version does 16 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
- __m128i zero = _mm_setzero_si128();
- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
-
- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
-
- do {
- __m128i y0 = _mm_loadu_si128(y0_ptr128);
- __m128i y1 = _mm_loadu_si128(y1_ptr128);
- __m128i y2 = _mm_unpackhi_epi8(y0, zero);
- __m128i y3 = _mm_unpackhi_epi8(y1, zero);
- y0 = _mm_unpacklo_epi8(y0, zero);
- y1 = _mm_unpacklo_epi8(y1, zero);
- y0 = _mm_mullo_epi16(y0, y0_fraction);
- y1 = _mm_mullo_epi16(y1, y1_fraction);
- y2 = _mm_mullo_epi16(y2, y0_fraction);
- y3 = _mm_mullo_epi16(y3, y1_fraction);
- y0 = _mm_add_epi16(y0, y1);
- y2 = _mm_add_epi16(y2, y3);
- y0 = _mm_srli_epi16(y0, 8);
- y2 = _mm_srli_epi16(y2, 8);
- y0 = _mm_packus_epi16(y0, y2);
- *dest128++ = y0;
- ++y0_ptr128;
- ++y1_ptr128;
- } while (dest128 < end128);
-}
-#elif defined(MOZILLA_COMPILE_WITH_MMX)
-// MMX version does 8 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
- __m64 zero = _mm_setzero_si64();
- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
-
- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
- __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
-
- do {
- __m64 y0 = *y0_ptr64++;
- __m64 y1 = *y1_ptr64++;
- __m64 y2 = _mm_unpackhi_pi8(y0, zero);
- __m64 y3 = _mm_unpackhi_pi8(y1, zero);
- y0 = _mm_unpacklo_pi8(y0, zero);
- y1 = _mm_unpacklo_pi8(y1, zero);
- y0 = _mm_mullo_pi16(y0, y0_fraction);
- y1 = _mm_mullo_pi16(y1, y1_fraction);
- y2 = _mm_mullo_pi16(y2, y0_fraction);
- y3 = _mm_mullo_pi16(y3, y1_fraction);
- y0 = _mm_add_pi16(y0, y1);
- y2 = _mm_add_pi16(y2, y3);
- y0 = _mm_srli_pi16(y0, 8);
- y2 = _mm_srli_pi16(y2, 8);
- y0 = _mm_packs_pu16(y0, y2);
- *dest64++ = y0;
- } while (dest64 < end64);
-}
-#else // no MMX or SSE2
// C version does 8 at a time to mimic MMX code
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
uint8* end = ybuf + source_width;
do {
ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
@@ -210,18 +140,48 @@ static void FilterRows(uint8* ybuf, cons
ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
y0_ptr += 8;
y1_ptr += 8;
ybuf += 8;
} while (ybuf < end);
}
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
#endif
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+#endif
+// Row-filter dispatcher: tries SSE2, then MMX, then falls back to FilterRows_C.
+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
+ const uint8* y1_ptr, int source_width,
+ int source_y_fraction) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+ if (mozilla::supports_sse2()) {
+ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+ // SSE2 path not taken (not compiled in, or supports_sse2() returned false).
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+ if (mozilla::supports_mmx()) {
+ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+ // Neither vector path was taken: use the plain C implementation.
+ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
+
+
// Scale a frame of YUV to 32 bit ARGB.
NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int source_width,
int source_height,
int width,
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation:
+// ybuf[i] = (y0[i] * (256 - f) + y1[i] * f) >> 8, f = source_y_fraction. MMX: 8 bytes/iteration.
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m64 zero = _mm_setzero_si64();  // supplies the high byte when widening 8 -> 16 bits
+ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);  // weight of row y1, in all 4 lanes
+ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);  // weight of row y0
+ // No _mm_empty() in this function. NOTE(review): caller presumably issues EMMS -- confirm.
+ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+ // do-while: runs at least once and in whole 8-byte steps, so it can pass source_width.
+ do {
+ __m64 y0 = *y0_ptr64++;  // 8 bytes of row 0
+ __m64 y1 = *y1_ptr64++;  // 8 bytes of row 1
+ __m64 y2 = _mm_unpackhi_pi8(y0, zero);  // upper 4 bytes of row 0 as 16-bit lanes
+ __m64 y3 = _mm_unpackhi_pi8(y1, zero);  // upper 4 bytes of row 1 as 16-bit lanes
+ y0 = _mm_unpacklo_pi8(y0, zero);  // lower 4 bytes of row 0 as 16-bit lanes
+ y1 = _mm_unpacklo_pi8(y1, zero);  // lower 4 bytes of row 1 as 16-bit lanes
+ y0 = _mm_mullo_pi16(y0, y0_fraction);  // low 16 bits of each product
+ y1 = _mm_mullo_pi16(y1, y1_fraction);
+ y2 = _mm_mullo_pi16(y2, y0_fraction);
+ y3 = _mm_mullo_pi16(y3, y1_fraction);
+ y0 = _mm_add_pi16(y0, y1);  // weighted sum; at most 255 * 256 if the fraction is in [0, 256]
+ y2 = _mm_add_pi16(y2, y3);
+ y0 = _mm_srli_pi16(y0, 8);  // logical shift: divide by 256
+ y2 = _mm_srli_pi16(y2, 8);
+ y0 = _mm_packs_pu16(y0, y2);  // narrow back to 8 bytes: low half from y0, high half from y2
+ *dest64++ = y0;  // NOTE(review): ybuf must have room up to the next multiple of 8 -- confirm
+ } while (dest64 < end64);
+}
+
+}
+}
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation:
+// ybuf[i] = (y0[i] * (256 - f) + y1[i] * f) >> 8, f = source_y_fraction. SSE2: 16 bytes/iteration.
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m128i zero = _mm_setzero_si128();  // supplies the high byte when widening 8 -> 16 bits
+ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);  // weight of row y1, in all 8 lanes
+ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);  // weight of row y0
+ // Source rows are read with unaligned loads below; ybuf is written with a plain store.
+ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+ // do-while: runs at least once and in whole 16-byte steps, so it can pass source_width.
+ do {
+ __m128i y0 = _mm_loadu_si128(y0_ptr128);  // 16 bytes of row 0, no alignment required
+ __m128i y1 = _mm_loadu_si128(y1_ptr128);  // 16 bytes of row 1, no alignment required
+ __m128i y2 = _mm_unpackhi_epi8(y0, zero);  // upper 8 bytes of row 0 as 16-bit lanes
+ __m128i y3 = _mm_unpackhi_epi8(y1, zero);  // upper 8 bytes of row 1 as 16-bit lanes
+ y0 = _mm_unpacklo_epi8(y0, zero);  // lower 8 bytes of row 0 as 16-bit lanes
+ y1 = _mm_unpacklo_epi8(y1, zero);  // lower 8 bytes of row 1 as 16-bit lanes
+ y0 = _mm_mullo_epi16(y0, y0_fraction);  // low 16 bits of each product
+ y1 = _mm_mullo_epi16(y1, y1_fraction);
+ y2 = _mm_mullo_epi16(y2, y0_fraction);
+ y3 = _mm_mullo_epi16(y3, y1_fraction);
+ y0 = _mm_add_epi16(y0, y1);  // weighted sum; at most 255 * 256 if the fraction is in [0, 256]
+ y2 = _mm_add_epi16(y2, y3);
+ y0 = _mm_srli_epi16(y0, 8);  // logical shift: divide by 256
+ y2 = _mm_srli_epi16(y2, 8);
+ y0 = _mm_packus_epi16(y0, y2);  // narrow back to 16 bytes: low half from y0, high half from y2
+ *dest128++ = y0;  // plain __m128i store. NOTE(review): assumes ybuf is 16-byte aligned -- confirm
+ ++y0_ptr128;
+ ++y1_ptr128;
+ } while (dest128 < end128);
+}
+
+}
+}