Bug 616469 - Video sync is slow because of slow yuv2rgb conversion. README and patch r=doublec a=blocking-fennec
authorOleg Romashin <romaxa@gmail.com>
Wed, 15 Dec 2010 10:17:26 -0800
changeset 59200 513cc1daf58f5bc303e89b8964f59184c681f305
parent 59199 fae6da8a664c6d4c73df880c8e1af5aa1dec6993
child 59201 aca204f42affc03c0a300253c5df200062ee7e8b
push idunknown
push userunknown
push dateunknown
reviewersdoublec, blocking-fennec
bugs616469
milestone2.0b9pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 616469 - Video sync is slow because of slow yuv2rgb conversion. README and patch r=doublec a=blocking-fennec
gfx/ycbcr/README
gfx/ycbcr/arm.patch
gfx/ycbcr/update.sh
--- a/gfx/ycbcr/README
+++ b/gfx/ycbcr/README
@@ -8,8 +8,10 @@ The code was copied from a Chromium svn 
 
 convert.patch: Change Chromium code to build using Mozilla build system.
                Add runtime CPU detection for MMX
                Move default C implementation to work on all platforms.
                Change Chromium code to allow a picture region.
                The YUV conversion will convert within this 
                picture region only.
                Add YCbCr 4:4:4 support
+
+arm.patch: Add YCbCr to rgb16_565 conversion support, bug 616469
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/arm.patch
@@ -0,0 +1,347 @@
+diff --git a/gfx/ycbcr/Makefile.in b/gfx/ycbcr/Makefile.in
+--- a/gfx/ycbcr/Makefile.in
++++ b/gfx/ycbcr/Makefile.in
+@@ -40,16 +40,21 @@ CPPSRCS += yuv_row_posix.cpp \
+ else
+ CPPSRCS += yuv_row_other.cpp \
+            $(NULL)
+ endif # Darwin
+ endif # SunOS
+ endif # linux
+ endif # windows
+ 
++ifeq (arm,$(findstring arm,$(OS_TEST)))
++CPPSRCS += yuv_convert_arm.cpp \
++           $(NULL)
++endif
++
+ EXTRA_DSO_LDOPTS += \
+         $(LIBS_DIR) \
+         $(EXTRA_DSO_LIBS) \
+         $(XPCOM_LIBS) \
+         $(NSPR_LIBS) \
+         $(NULL)
+ 
+ include $(topsrcdir)/config/rules.mk
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -19,25 +19,56 @@
+ #include "yuv_convert.h"
+ 
+ // Header for low level row functions.
+ #include "yuv_row.h"
+ #define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
+ #define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
+ #include "mozilla/SSE.h"
+ 
++#ifdef HAVE_YCBCR_TO_RGB565
++void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
++#endif
++
+ namespace mozilla {
+ 
+ namespace gfx {
+  
+ // 16.16 fixed point arithmetic
+ const int kFractionBits = 16;
+ const int kFractionMax = 1 << kFractionBits;
+ const int kFractionMask = ((1 << kFractionBits) - 1);
+ 
++// Convert a YV12 frame to 16 bit RGB565 (no-op without HAVE_YCBCR_TO_RGB565).
++// yuv_type is currently ignored: chroma is always read as 2x2 subsampled.
++NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int pic_x,
++                                  int pic_y,
++                                  int pic_width,
++                                  int pic_height,
++                                  int y_pitch,
++                                  int uv_pitch,
++                                  int rgb_pitch,
++                                  YUVType yuv_type)
++{
++#ifdef HAVE_YCBCR_TO_RGB565
++  for (int i = 0; i < pic_height; i++) {
++    int row = pic_y + i;  // source row inside the full frame
++    int uv_offs = uv_pitch * (row / 2) + pic_x / 2;  // chroma is halved in x and y
++    yv12_to_rgb565_neon((uint16*)(rgb_buf + rgb_pitch * i),  // byte stride, as for RGB32
++                         y_buf + y_pitch * row + pic_x, u_buf + uv_offs,
++                         v_buf + uv_offs, pic_width,
++                         pic_x & 1);  // odd start column: first pixel is converted alone
++  }
++#endif
++}
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf,
+                                   int pic_x,
+                                   int pic_y,
+                                   int pic_width,
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -2,16 +2,20 @@
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+ 
+ #ifndef MEDIA_BASE_YUV_CONVERT_H_
+ #define MEDIA_BASE_YUV_CONVERT_H_
+ 
+ #include "chromium_types.h"
+ #include "gfxCore.h"
++
++#ifdef __arm__
++#define HAVE_YCBCR_TO_RGB565 1
++#endif
+  
+ namespace mozilla {
+ 
+ namespace gfx {
+  
+ // Type of YUV surface.
+ // The value of these enums matter as they are used to shift vertical indices.
+ enum YUVType {
+@@ -36,16 +40,31 @@ enum Rotate {
+ // Filter affects how scaling looks.
+ enum ScaleFilter {
+   FILTER_NONE = 0,        // No filter (point sampled).
+   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
+   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
+   FILTER_BILINEAR = 3     // Bilinear filter.
+ };
+ 
++// Convert a frame of YUV to 16 bit RGB565.
++// Pass in YV12 data only; yuv_type is not consulted by the implementation.
++NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
++                                  const uint8* uplane,
++                                  const uint8* vplane,
++                                  uint8* rgbframe,
++                                  int pic_x,
++                                  int pic_y,
++                                  int pic_width,
++                                  int pic_height,
++                                  int ystride,
++                                  int uvstride,
++                                  int rgbstride,
++                                  YUVType yuv_type);
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
+                                   const uint8* uplane,
+                                   const uint8* vplane,
+                                   uint8* rgbframe,
+                                   int pic_x,
+                                   int pic_y,
+diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_convert_arm.cpp
+@@ -0,0 +1,201 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
++
++#include "yuv_convert.h"
++
++void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag)
++{
++    static __attribute__((aligned(16))) uint16 acc_r[8] = {
++        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
++    };
++    static __attribute__((aligned(16))) uint16 acc_g[8] = {
++        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
++    };
++    static __attribute__((aligned(16))) uint16 acc_b[8] = {
++        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
++    };
++    /*
++     * Converts n pixels read through y/u/v into RGB565 written through dst.
++     * If oddflag is non-zero, a single pixel is converted first.
++     *
++     * Registers:
++     * q0, q1 : d0, d1, d2, d3  - initial loading of YUV data
++     * q2     : d4, d5          - converted RGB data
++     * q3     : d6, d7          - temporary storage / packed RGB565 output
++     * q4, q5 : d8 - d11        - clipping constants (#15, #20, #31, #36)
++     * q8, q9 : d16 - d19       - expanded Y data (evenY * 149, oddY * 149)
++     * q10    : d20, d21        - red accumulator
++     * q11    : d22, d23        - green accumulator
++     * q12    : d24, d25        - blue accumulator
++     * q13, q14, q15            - constants (#16, #149, #204, #50, #104, #154)
++     */
++    asm volatile (
++".fpu neon\n"
++".macro convert_macroblock size\n"
+/* load up to 16 source pixels */
+	".if \\size == 16\n"
+	    "pld [%[y], #64]\n"
+	    "pld [%[u], #64]\n"
+	    "pld [%[v], #64]\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d3}, [%[y]]!\n"
+	    "vld1.8 {d0}, [%[u]]!\n"
+	    "vld1.8 {d2}, [%[v]]!\n"
+	".elseif \\size == 8\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d0[2]}, [%[u]]!\n"
+	    "vld1.8 {d0[3]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	    "vld1.8 {d2[2]}, [%[v]]!\n"
+	    "vld1.8 {d2[3]}, [%[v]]!\n"
+	".elseif \\size == 4\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d1[2]}, [%[y]]!\n"
+	    "vld1.8 {d1[3]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	".elseif \\size == 2\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".elseif \\size == 1\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".else\n"
+	    ".error \"unsupported macroblock size\"\n"
+	".endif\n"
++
+        /* d1 - Y data (first 8 bytes) */
+        /* d3 - Y data (next 8 bytes) */
+        /* d0 - U data, d2 - V data */
++
+	/* split even and odd Y color components */
+	"vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
+	/* clip upper and lower boundaries (saturating add, then saturating subtract) */
+	"vqadd.u8    q0, q0, q4\n"
+	"vqadd.u8    q1, q1, q4\n"
+	"vqsub.u8    q0, q0, q5\n"
+	"vqsub.u8    q1, q1, q5\n"
++
+	"vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
++
+	"vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
+	"vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
++
+	"vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
+	"vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
+	"vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
+	"vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
+	"vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
+	"vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
+	"vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
+	"vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
+	"vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 */
++
+	"vhsub.s16   q3, q8, q10\n"                  /* calculate even red components */
+	"vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
+	"vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
+	"vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
++
+	"vhadd.s16   q3, q8, q11\n"                  /* calculate even green components */
+	"vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
+	"vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
+	"vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
++
+	"vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components */
+	"vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
+	"vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
+	"vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
++
+	"vzip.8      d0, d3\n"                       /* join even and odd red components */
+	"vzip.8      d1, d4\n"                       /* join even and odd green components */
+	"vzip.8      d2, d5\n"                       /* join even and odd blue components */
++
+	"vshll.u8    q3, d0, #8\n\t"
+	"vshll.u8    q8, d1, #8\n\t"
+	"vshll.u8    q9, d2, #8\n\t"
+	"vsri.u16    q3, q8, #5\t\n"
+	"vsri.u16    q3, q9, #11\t\n"
+	/* store pixel data to memory (q3 holds the first 8 packed RGB565 pixels) */
+	".if \\size == 16\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	"    vshll.u8    q3, d3, #8\n\t"
+	"    vshll.u8    q8, d4, #8\n\t"
+	"    vshll.u8    q9, d5, #8\n\t"
+	"    vsri.u16    q3, q8, #5\t\n"
+	"    vsri.u16    q3, q9, #11\t\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 8\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 4\n"
+	"    vst1.16 {d6}, [%[dst]]!\n"
+	".elseif \\size == 2\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	"    vst1.16 {d6[1]}, [%[dst]]!\n"
+	".elseif \\size == 1\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	".endif\n"
+	".endm\n"
++
+	"vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
+	"vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
+	"vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
+	"vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
++
+	"vmov.u8     d26, #16\n" /* not referenced by convert_macroblock */
+	"vmov.u8     d27, #149\n" /* Y multiplier */
+	"vmov.u8     d28, #204\n" /* V multiplier for red */
+	"vmov.u8     d29, #50\n" /* U multiplier for green */
+	"vmov.u8     d30, #104\n" /* V multiplier for green; also first part of U multiplier for blue */
+	"vmov.u8     d31, #154\n" /* second part of U multiplier for blue */
++
+	"cmp         %[oddflag], #0\n"
+	"beq         1f\n"
+	"convert_macroblock 1\n"
+	"sub         %[n], %[n], #1\n"
+    "1:\n" /* target of the forward branch taken when oddflag is zero */
+	"subs        %[n], %[n], #16\n"
+	"blt         2f\n"
+    "1:\n" /* loop head: 16 pixels per iteration */
+	"convert_macroblock 16\n"
+	"subs        %[n], %[n], #16\n"
+	"bge         1b\n"
+    "2:\n" /* n is now (remaining - 16); its low four bits equal the remaining pixel count */
+	"tst         %[n], #8\n"
+	"beq         3f\n"
+	"convert_macroblock 8\n"
+    "3:\n"
+	"tst         %[n], #4\n"
+	"beq         4f\n"
+	"convert_macroblock 4\n"
+    "4:\n"
+	"tst         %[n], #2\n"
+	"beq         5f\n"
+	"convert_macroblock 2\n"
+    "5:\n"
+	"tst         %[n], #1\n"
+	"beq         6f\n"
+	"convert_macroblock 1\n"
+    "6:\n"
+	".purgem convert_macroblock\n"
+	: [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
+	: [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
+	  [oddflag] "r" (oddflag)
+	: "cc", "memory",
+	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+	  "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
+	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+    );
++}
--- a/gfx/ycbcr/update.sh
+++ b/gfx/ycbcr/update.sh
@@ -2,8 +2,9 @@
 cp $1/media/base/yuv_convert.h .
 cp $1/media/base/yuv_convert.cc yuv_convert.cpp
 cp $1/media/base/yuv_row.h .
 cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp
 cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp
 cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp
 patch -p3 <convert.patch
+patch -p3 <arm.patch