Bug 616469 - Video sync is slow because of slow yuv2rgb conversion. ycbcr r=chris.double a=blocking-fennec
author Oleg Romashin <romaxa@gmail.com>
Fri, 10 Dec 2010 21:32:52 +0200
changeset 59198 06802b3064902aae1264f64e991468af547e981a
parent 59197 398c0e73be649e1310b9164f1a75aa43a4dca739
child 59199 fae6da8a664c6d4c73df880c8e1af5aa1dec6993
push id unknown
push user unknown
push date unknown
reviewers chris.double, blocking-fennec
bugs 616469
milestone 2.0b9pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 616469 - Video sync is slow because of slow yuv2rgb conversion. ycbcr r=chris.double a=blocking-fennec
gfx/ycbcr/Makefile.in
gfx/ycbcr/yuv_convert.cpp
gfx/ycbcr/yuv_convert.h
gfx/ycbcr/yuv_convert_arm.cpp
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -40,16 +40,21 @@ CPPSRCS += yuv_row_posix.cpp \
 else
 CPPSRCS += yuv_row_other.cpp \
            $(NULL)
 endif # Darwin
 endif # SunOS
 endif # linux
 endif # windows
 
+ifeq (arm,$(findstring arm,$(OS_TEST)))
+CPPSRCS += yuv_convert_arm.cpp \
+           $(NULL)
+endif # arm
+
 EXTRA_DSO_LDOPTS += \
         $(LIBS_DIR) \
         $(EXTRA_DSO_LIBS) \
         $(XPCOM_LIBS) \
         $(NSPR_LIBS) \
         $(NULL)
 
 include $(topsrcdir)/config/rules.mk
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -19,25 +19,56 @@
 #include "yuv_convert.h"
 
 // Header for low level row functions.
 #include "yuv_row.h"
 #define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
 #define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
 #include "mozilla/SSE.h"
 
+#ifdef HAVE_YCBCR_TO_RGB565  // set by yuv_convert.h when __arm__ is defined
+void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);  // defined in yuv_convert_arm.cpp
+#endif  // HAVE_YCBCR_TO_RGB565
+
 namespace mozilla {
 
 namespace gfx {
  
 // 16.16 fixed point arithmetic
 const int kFractionBits = 16;
 const int kFractionMax = 1 << kFractionBits;
 const int kFractionMask = ((1 << kFractionBits) - 1);
 
+
+// Convert a pic_width x pic_height window of a YV12 frame to 16 bit RGB565.
+NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,   // Y plane
+                                   const uint8* u_buf,   // U plane (one sample per 2x2 pixels)
+                                   const uint8* v_buf,   // V plane, same layout as U
+                                   uint8* rgb_buf,       // first output pixel
+                                   int pic_x,            // left edge of the window in the Y plane
+                                   int pic_y,            // top edge of the window in the Y plane
+                                   int pic_width,        // pixels converted per row
+                                   int pic_height,       // rows converted
+                                   int y_pitch,          // bytes between Y rows
+                                   int uv_pitch,         // bytes between U/V rows
+                                   int rgb_pitch,        // bytes between output rows
+                                   YUVType yuv_type)     // not consulted: YV12 subsampling is assumed
+{
+#ifdef HAVE_YCBCR_TO_RGB565  // without it this function is a no-op
+  for (int i = 0; i < pic_height; i++) {
+    yv12_to_rgb565_neon((uint16*)(rgb_buf + rgb_pitch * i),  // step by the byte stride, not pic_width
+                         y_buf + y_pitch * (pic_y + i) + pic_x,
+                         u_buf + uv_pitch * ((pic_y + i) / 2) + pic_x / 2,
+                         v_buf + uv_pitch * ((pic_y + i) / 2) + pic_x / 2,
+                         pic_width,
+                         pic_x & 1);  // odd start: leading pixel is converted alone so chroma pairing stays aligned
+  }
+#endif
+}
+
 // Convert a frame of YUV to 32 bit ARGB.
 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int pic_x,
                                   int pic_y,
                                   int pic_width,
--- a/gfx/ycbcr/yuv_convert.h
+++ b/gfx/ycbcr/yuv_convert.h
@@ -2,16 +2,20 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #ifndef MEDIA_BASE_YUV_CONVERT_H_
 #define MEDIA_BASE_YUV_CONVERT_H_
 
 #include "chromium_types.h"
 #include "gfxCore.h"
+
+#ifdef __arm__  // gates the RGB565 path in yuv_convert.cpp
+#define HAVE_YCBCR_TO_RGB565 1
+#endif  // __arm__ -- NOTE(review): __arm__ alone does not guarantee NEON; confirm every ARM target supports it
  
 namespace mozilla {
 
 namespace gfx {
  
 // Type of YUV surface.
 // The value of these enums matter as they are used to shift vertical indices.
 enum YUVType {
@@ -36,16 +40,31 @@ enum Rotate {
 // Filter affects how scaling looks.
 enum ScaleFilter {
   FILTER_NONE = 0,        // No filter (point sampled).
   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
   FILTER_BILINEAR = 3     // Bilinear filter.
 };
 
+// Convert a frame of YUV to 16 bit RGB565, written through rgbframe.
+// Pass in YV12 formats; a no-op unless HAVE_YCBCR_TO_RGB565 is defined.
+NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
+                                  const uint8* uplane,
+                                  const uint8* vplane,
+                                  uint8* rgbframe,
+                                  int pic_x,
+                                  int pic_y,
+                                  int pic_width,
+                                  int pic_height,
+                                  int ystride,
+                                  int uvstride,
+                                  int rgbstride,
+                                  YUVType yuv_type);  // not consulted by the implementation
+
 // Convert a frame of YUV to 32 bit ARGB.
 // Pass in YV16/YV12 depending on source format
 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
                                   const uint8* uplane,
                                   const uint8* vplane,
                                   uint8* rgbframe,
                                   int pic_x,
                                   int pic_y,
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_arm.cpp
@@ -0,0 +1,201 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
+
+#include "yuv_convert.h"
+
+void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag)
+{
+    static __attribute__((aligned(16))) uint16 acc_r[8] = {   /* initial value of the red accumulator (q10) */
+        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
+    };
+    static __attribute__((aligned(16))) uint16 acc_g[8] = {   /* initial value of the green accumulator (q11) */
+        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
+    };
+    static __attribute__((aligned(16))) uint16 acc_b[8] = {   /* initial value of the blue accumulator (q12) */
+        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
+    };
+    /*
+     * Converts one row of n YV12 pixels to RGB565; if oddflag is non-zero a
+     * single leading pixel is converted first with its own U/V sample.
+     * Register usage:
+     * q0, q1   : d0 = U, d1/d3 = Y, d2 = V on load; then d0-d5 hold 8-bit R/G/B
+     * q2       : d4 = V >> 1 scratch, later odd green (d4) / odd blue (d5)
+     * q3       : d6, d7 - temporary storage and the packed RGB565 output
+     * q4, q5   : d8-d11 - clipping constants (added, then subtracted)
+     * q6, q7   : d12-d15 - not used
+     * q8, q9   : Y * 149 for even/odd pixels; reused while packing RGB565
+     * q10      : red accumulator
+     * q11      : green accumulator
+     * q12      : blue accumulator
+     * d26-d31  : constants #16, #149, #204, #50, #104, #154 (d26 is never read)
+     */
+    asm volatile (
+".fpu neon\n"
+".macro convert_macroblock size\n"
+/* load up to 16 source pixels */
+	".if \\size == 16\n"
+	    "pld [%[y], #64]\n"
+	    "pld [%[u], #64]\n"
+	    "pld [%[v], #64]\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d3}, [%[y]]!\n"
+	    "vld1.8 {d0}, [%[u]]!\n"
+	    "vld1.8 {d2}, [%[v]]!\n"
+	".elseif \\size == 8\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d0[2]}, [%[u]]!\n"
+	    "vld1.8 {d0[3]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	    "vld1.8 {d2[2]}, [%[v]]!\n"
+	    "vld1.8 {d2[3]}, [%[v]]!\n"
+	".elseif \\size == 4\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d1[2]}, [%[y]]!\n"
+	    "vld1.8 {d1[3]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	".elseif \\size == 2\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".elseif \\size == 1\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".else\n"
+	    ".error \"unsupported macroblock size\"\n"
+	".endif\n"
+
+        /* d1 - Y data (first 8 bytes) */
+        /* d3 - Y data (next 8 bytes) */
+        /* d0 - U data, d2 - V data */
+
+	/* split even and odd Y color components */
+	"vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
+	/* clip upper and lower boundaries */
+	"vqadd.u8    q0, q0, q4\n"
+	"vqadd.u8    q1, q1, q4\n"
+	"vqsub.u8    q0, q0, q5\n"
+	"vqsub.u8    q1, q1, q5\n"
+
+	"vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
+
+	"vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
+	"vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
+
+	"vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
+	"vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
+	"vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
+	"vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
+	"vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
+	"vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
+	"vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
+	"vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
+	"vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 */
+
+	"vhsub.s16   q3, q8, q10\n"                  /* calculate even red components */
+	"vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
+	"vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
+	"vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
+
+	"vhadd.s16   q3, q8, q11\n"                  /* calculate even green components */
+	"vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
+	"vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
+	"vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
+
+	"vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components */
+	"vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
+	"vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
+	"vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
+
+	"vzip.8      d0, d3\n"                       /* join even and odd red components */
+	"vzip.8      d1, d4\n"                       /* join even and odd green components */
+	"vzip.8      d2, d5\n"                       /* join even and odd blue components */
+
+	"vshll.u8    q3, d0, #8\n\t"                 /* q3 = red << 8 (first 8 pixels) */
+	"vshll.u8    q8, d1, #8\n\t"                 /* q8 = green << 8 */
+	"vshll.u8    q9, d2, #8\n\t"                 /* q9 = blue << 8 */
+	"vsri.u16    q3, q8, #5\t\n"                 /* insert green below the top 5 (red) bits */
+	"vsri.u16    q3, q9, #11\t\n"                /* insert blue below the top 11 bits -> RGB565 */
+	/* store pixel data to memory */
+	".if \\size == 16\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	"    vshll.u8    q3, d3, #8\n\t"             /* pack the second 8 pixels the same way */
+	"    vshll.u8    q8, d4, #8\n\t"
+	"    vshll.u8    q9, d5, #8\n\t"
+	"    vsri.u16    q3, q8, #5\t\n"
+	"    vsri.u16    q3, q9, #11\t\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 8\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 4\n"
+	"    vst1.16 {d6}, [%[dst]]!\n"
+	".elseif \\size == 2\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	"    vst1.16 {d6[1]}, [%[dst]]!\n"
+	".elseif \\size == 1\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	".endif\n"
+	".endm\n"
+
+	"vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
+	"vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
+	"vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
+	"vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
+
+	"vmov.u8     d26, #16\n"
+	"vmov.u8     d27, #149\n"
+	"vmov.u8     d28, #204\n"
+	"vmov.u8     d29, #50\n"
+	"vmov.u8     d30, #104\n"
+	"vmov.u8     d31, #154\n"
+
+	"cmp         %[oddflag], #0\n"               /* odd start: convert one pixel on its own first */
+	"beq         1f\n"
+	"convert_macroblock 1\n"
+	"sub         %[n], %[n], #1\n"
+    "1:\n"
+	"subs        %[n], %[n], #16\n"              /* n stays biased by -16 while the main loop runs */
+	"blt         2f\n"
+    "1:\n"
+	"convert_macroblock 16\n"
+	"subs        %[n], %[n], #16\n"
+	"bge         1b\n"
+    "2:\n"
+	"tst         %[n], #8\n"                     /* n < 0 here, but its low 4 bits equal the leftover pixel count */
+	"beq         3f\n"
+	"convert_macroblock 8\n"
+    "3:\n"
+	"tst         %[n], #4\n"
+	"beq         4f\n"
+	"convert_macroblock 4\n"
+    "4:\n"
+	"tst         %[n], #2\n"
+	"beq         5f\n"
+	"convert_macroblock 2\n"
+    "5:\n"
+	"tst         %[n], #1\n"
+	"beq         6f\n"
+	"convert_macroblock 1\n"
+    "6:\n"
+	".purgem convert_macroblock\n"
+	: [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
+	: [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
+	  [oddflag] "r" (oddflag)
+	: "cc", "memory",
+	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+	  "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
+	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+    );
+}