Bug 571739 - Use SIMD for conversion from RGB to YUV on Win64. r=kinetik
authorMakoto Kato <m_kato@ga2.so-net.ne.jp>
Mon, 18 Apr 2011 09:51:18 +0900
changeset 68500 0eaee388eeadf2da8850c88f4bf3d31673fa255b
parent 68499 d29e9cb9d0c940a6e897e66db6fb849dd887835b
child 68501 c062731105cf04ef462a78debeaf00a4e492167d
push id19663
push userm_kato@ga2.so-net.ne.jp
push dateMon, 25 Apr 2011 04:49:30 +0000
treeherdermozilla-central@c062731105cf [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerskinetik
bugs571739
milestone6.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 571739 - Use SIMD for conversion from RGB to YUV on Win64. r=kinetik
gfx/ycbcr/Makefile.in
gfx/ycbcr/README
gfx/ycbcr/update.sh
gfx/ycbcr/win64.patch
gfx/ycbcr/yuv_row_win64.cpp
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -48,18 +48,23 @@ ifdef SOLARIS_SUNPRO_CXX
 yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=mmx -xO4
 yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse2 -xO4
 endif
 
 endif
 
 
 ifdef _MSC_VER
+ifeq ($(OS_TEST),x86_64)
+CPPSRCS += yuv_row_win64.cpp \
+           $(NULL)
+else
 CPPSRCS += yuv_row_win.cpp \
            $(NULL)
+endif
 else
 ifeq ($(OS_ARCH),Linux)
 CPPSRCS += yuv_row_posix.cpp \
            $(NULL)
 else
 ifeq ($(OS_ARCH),SunOS)
 CPPSRCS += yuv_row_posix.cpp \
            $(NULL)
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -16,8 +16,10 @@ convert.patch contains the following cha
   * Add runtime CPU detection for MMX
   * Move default C implementation to work on all platforms.
   * Change Chromium code to allow a picture region.
   * The YUV conversion will convert within this picture region only.
   * Add YCbCr 4:4:4 support
   * Bug 619178 - Update CPU detection in yuv_convert to new SSE.h interface.
   * Bug 616778 - Split yuv_convert FilterRows vectorized code into separate files so it can
     be properly guarded with cpuid() calls.
+
+win64.patch: SSE2 optimization for Microsoft Visual C++ x64 version
--- a/gfx/ycbcr/update.sh
+++ b/gfx/ycbcr/update.sh
@@ -2,8 +2,9 @@
 cp $1/media/base/yuv_convert.h .
 cp $1/media/base/yuv_convert.cc yuv_convert.cpp
 cp $1/media/base/yuv_row.h .
 cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp
 cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp
 patch -p3 <convert.patch
+patch -p3 <win64.patch
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/win64.patch
@@ -0,0 +1,210 @@
+diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_row_win64.cpp
+@@ -0,0 +1,205 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include "yuv_row.h"
++
++extern "C" {
++
++// x64 compiler doesn't support MMX and inline assembler.  Use SSE2 intrinsics.
++
++#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
++#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
++
++#include <emmintrin.h>
++
++static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                          const uint8* u_buf,
++                                          const uint8* v_buf,
++                                          uint8* rgb_buf,
++                                          int width) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128  xmmY;
++
++  while (width >= 2) {
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
++
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                    const uint8* u_buf,
++                                    const uint8* v_buf,
++                                    uint8* rgb_buf,
++                                    int width,
++                                    int source_dx) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128  xmmY;
++  uint8 u, v, y;
++  int x = 0;
++
++  while (width >= 2) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++    x += source_dx;
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    y = y_buf[x >> 16];
++    x += source_dx;
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                          const uint8* u_buf,
++                                          const uint8* v_buf,
++                                          uint8* rgb_buf,
++                                          int width,
++                                          int source_dx) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128  xmmY;
++  uint8 u0, u1, v0, v1, y0, y1;
++  uint32 uv_frac, y_frac, u, v, y;
++  int x = 0;
++
++  if (source_dx >= 0x20000) {
++    x = 32768;
++  }
++
++  while(width >= 2) {
++    u0 = u_buf[x >> 17];
++    u1 = u_buf[(x >> 17) + 1];
++    v0 = v_buf[x >> 17];
++    v1 = v_buf[(x >> 17) + 1];
++    y0 = y_buf[x >> 16];
++    y1 = y_buf[(x >> 16) + 1];
++    uv_frac = (x & 0x1fffe);
++    y_frac = (x & 0xffff);
++    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
++    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
++    x += source_dx;
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    y0 = y_buf[x >> 16];
++    y1 = y_buf[(x >> 16) + 1];
++    y_frac = (x & 0xffff);
++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
++    x += source_dx;
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width) {
++  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
++}
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int source_dx) {
++  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int source_dx) {
++  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
++                                source_dx);
++}
++
++} // extern "C"
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// x64 compiler doesn't support MMX and inline assembler.  Use SSE2 intrinsics.
+
+#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+
+  while (width >= 2) {
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
+
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int source_dx) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+  uint8 u, v, y;
+  int x = 0;
+
+  while (width >= 2) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+    x += source_dx;
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    y = y_buf[x >> 16];
+    x += source_dx;
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width,
+                                          int source_dx) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+  uint8 u0, u1, v0, v1, y0, y1;
+  uint32 uv_frac, y_frac, u, v, y;
+  int x = 0;
+
+  if (source_dx >= 0x20000) {
+    x = 32768;
+  }
+
+  while(width >= 2) {
+    u0 = u_buf[x >> 17];
+    u1 = u_buf[(x >> 17) + 1];
+    v0 = v_buf[x >> 17];
+    v1 = v_buf[(x >> 17) + 1];
+    y0 = y_buf[x >> 16];
+    y1 = y_buf[(x >> 16) + 1];
+    uv_frac = (x & 0x1fffe);
+    y_frac = (x & 0xffff);
+    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
+    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+    x += source_dx;
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    y0 = y_buf[x >> 16];
+    y1 = y_buf[(x >> 16) + 1];
+    y_frac = (x & 0xffff);
+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+    x += source_dx;
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx) {
+  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx) {
+  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
+                                source_dx);
+}
+
+} // extern "C"