Bug 754364. Add bilinear non-repeat and repeat fast paths. r=joe,a=joe
authorJeff Muizelaar <jmuizelaar@mozilla.com>
Tue, 15 May 2012 18:26:16 -0400
changeset 94301 0fc9cddcb19aef6bf828780bba2acdb6a0511c68
parent 94300 63727a5fa570e0f0aa0ca125541d87805b8c5dad
child 94302 b1c12c3c5841ef17586b128ac31bc19e0d8b56ba
push id1313
push userjmuizelaar@mozilla.com
push dateFri, 18 May 2012 21:07:29 +0000
treeherdermozilla-aurora@b1c12c3c5841 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjoe, joe
bugs754364
milestone14.0a2
Bug 754364. Add bilinear non-repeat and repeat fast paths. r=joe,a=joe This gives significantly faster bilinear sampling when we don't have NEON.
gfx/cairo/README
gfx/cairo/libpixman/src/pixman-fast-path.c
gfx/cairo/libpixman/src/pixman-inlines.h
gfx/cairo/pixman-bilinear-fastpath.patch
layout/reftests/transform-3d/reftest.list
--- a/gfx/cairo/README
+++ b/gfx/cairo/README
@@ -189,11 +189,13 @@ pixman-rename-and-endian.patch: include 
 NOTE: we previously supported ARM assembler on MSVC, this has been removed because of the maintenance burden
 
 pixman-export.patch: use cairo_public for PIXMAN_EXPORT to make sure pixman symbols are not exported in libxul
 
 pixman-limits.patch: include limits.h for SIZE_MAX
 
 pixman-lowres-interp.patch: Use lower quality interpolation for more speed.
 
+pixman-bilinear-fastpath.patch: Bilinear fast paths for non-neon
+
 ==== disable printing patch ====
 
 disable-printing.patch:  allows us to use NS_PRINTING to disable printing.
--- a/gfx/cairo/libpixman/src/pixman-fast-path.c
+++ b/gfx/cairo/libpixman/src/pixman-fast-path.c
@@ -1186,16 +1186,228 @@ FAST_NEAREST (8888_565_none, 8888, 0565,
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
 FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
 
+static force_inline void
+scaled_bilinear_scanline_8888_565_OVER (uint16_t *       dst,
+                                        const uint32_t * mask,
+                                        const uint32_t * src_top,
+                                        const uint32_t * src_bottom,
+                                        int32_t          w,
+                                        int              wt,
+                                        int              wb,
+                                        pixman_fixed_t   vx,
+                                        pixman_fixed_t   unit_x,
+                                        pixman_fixed_t   max_vx,
+                                        pixman_bool_t    zero_src)
+{
+    while ((w -= 1) >= 0)
+    {
+	uint32_t tl = src_top [pixman_fixed_to_int (vx)];
+	uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];
+	uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];
+	uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
+	uint32_t src, result;
+	uint16_t d;
+	d = *dst;
+	src = bilinear_interpolation (tl, tr,
+				      bl, br,
+				      interpolation_coord(vx),
+				      wb >> (8 - INTERPOLATION_PRECISION_BITS));
+	vx += unit_x;
+	result = over (src, CONVERT_0565_TO_0888 (d));
+	*dst++ = CONVERT_8888_TO_0565(result);
+    }
+}
+
+static force_inline void
+scaled_bilinear_scanline_8888_8888_OVER (uint32_t *       dst,
+                                         const uint32_t * mask,
+                                         const uint32_t * src_top,
+                                         const uint32_t * src_bottom,
+                                         int32_t          w,
+                                         int              wt,
+                                         int              wb,
+                                         pixman_fixed_t   vx,
+                                         pixman_fixed_t   unit_x,
+                                         pixman_fixed_t   max_vx,
+                                         pixman_bool_t    zero_src)
+{
+    while ((w -= 1) >= 0)
+    {
+	uint32_t tl = src_top [pixman_fixed_to_int (vx)];
+	uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];
+	uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];
+	uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
+	uint32_t src;
+	uint32_t d;
+	uint32_t result;
+	d = *dst;
+	src = bilinear_interpolation (tl, tr,
+				      bl, br,
+				      interpolation_coord(vx),
+				      wb >> (8 - INTERPOLATION_PRECISION_BITS));
+	vx += unit_x;
+	*dst++ = over (src, d);
+    }
+}
+
+#if 1
+
+static force_inline void
+scaled_bilinear_scanline_565_565_SRC (uint16_t *       dst,
+				      const uint32_t * mask,
+				      const uint16_t * src_top,
+				      const uint16_t * src_bottom,
+				      int32_t          w,
+				      int              wt,
+				      int              wb,
+				      pixman_fixed_t   vx,
+				      pixman_fixed_t   unit_x,
+				      pixman_fixed_t   max_vx,
+				      pixman_bool_t    zero_src)
+{
+    while ((w -= 1) >= 0)
+    {
+	uint16_t tl = src_top [pixman_fixed_to_int (vx)];
+	uint16_t tr = src_top [pixman_fixed_to_int (vx) + 1];
+	uint16_t bl = src_bottom [pixman_fixed_to_int (vx)];
+	uint16_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
+	uint32_t d;
+	d = bilinear_interpolation(CONVERT_0565_TO_8888(tl),
+				   CONVERT_0565_TO_8888(tr),
+				   CONVERT_0565_TO_8888(bl),
+				   CONVERT_0565_TO_8888(br),
+				   interpolation_coord(vx),
+				   wb >> (8 - INTERPOLATION_PRECISION_BITS));
+	vx += unit_x;
+	*dst++ = CONVERT_8888_TO_0565(d);
+    }
+}
+
+#else
+
+#define SK_G16_MASK_IN_PLACE 0xfc0
+
+static inline uint32_t SkExpand_rgb_16(uint16_t c) {
+
+    return ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE);
+}
+
+/** Compress an expanded value (from SkExpand_rgb_16) back down to a 16bit
+    color value. The computation yields only 16bits of valid data, but we claim
+    to return 32bits, so that the compiler won't generate extra instructions to
+    "clean" the top 16bits. However, the top 16 can contain garbage, so it is
+    up to the caller to safely ignore them.
+*/
+static inline uint16_t SkCompact_rgb_16(uint32_t c) {
+    return ((c >> 16) & SK_G16_MASK_IN_PLACE) | (c & ~SK_G16_MASK_IN_PLACE);
+}
+// returns expanded * 5bits
+static inline uint32_t Filter_565_Expanded(unsigned x, unsigned y,
+                                           uint32_t a00, uint32_t a01,
+                                           uint32_t a10, uint32_t a11) {
+    a00 = SkExpand_rgb_16(a00);
+    a01 = SkExpand_rgb_16(a01);
+    a10 = SkExpand_rgb_16(a10);
+    a11 = SkExpand_rgb_16(a11);
+    
+    int xy = x * y >> 3;
+    return  a00 * (32 - 2*y - 2*x + xy) +
+            a01 * (2*x - xy) +
+            a10 * (2*y - xy) +
+            a11 * xy;
+}
+
+
+
+static force_inline void
+scaled_bilinear_scanline_565_565_SRC (uint16_t *       dst,
+				      const uint32_t * mask,
+				      const uint16_t * src_top,
+				      const uint16_t * src_bottom,
+				      int32_t          w,
+				      int              wt,
+				      int              wb,
+				      pixman_fixed_t   vx,
+				      pixman_fixed_t   unit_x,
+				      pixman_fixed_t   max_vx,
+				      pixman_bool_t    zero_src)
+{
+    while ((w -= 1) >= 0)
+    {
+	uint16_t tl = src_top [pixman_fixed_to_int (vx)];
+	uint16_t tr = src_top [pixman_fixed_to_int (vx) + 1];
+	uint16_t bl = src_bottom [pixman_fixed_to_int (vx)];
+	uint16_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
+
+        uint32_t tmp = Filter_565_Expanded((vx>>12)&0xf, wb>>4, tl, tr, bl, br);
+        vx += unit_x;
+        *dst++ = SkCompact_rgb_16((tmp) >> 5);
+    }
+}
+
+
+#endif
+FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC,
+			       scaled_bilinear_scanline_565_565_SRC,
+			       uint16_t, uint32_t, uint16_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC,
+			       scaled_bilinear_scanline_565_565_SRC,
+			       uint16_t, uint32_t, uint16_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC,
+			       scaled_bilinear_scanline_565_565_SRC,
+			       uint16_t, uint32_t, uint16_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC,
+			       scaled_bilinear_scanline_565_565_SRC,
+			       uint16_t, uint32_t, uint16_t,
+			       NORMAL, FLAG_NONE)
+
+FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER,
+			       scaled_bilinear_scanline_8888_565_OVER,
+			       uint32_t, uint32_t, uint16_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER,
+			       scaled_bilinear_scanline_8888_565_OVER,
+			       uint32_t, uint32_t, uint16_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER,
+			       scaled_bilinear_scanline_8888_565_OVER,
+			       uint32_t, uint32_t, uint16_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER,
+			       scaled_bilinear_scanline_8888_565_OVER,
+			       uint32_t, uint32_t, uint16_t,
+			       NORMAL, FLAG_NONE)
+
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER,
+			       scaled_bilinear_scanline_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER,
+			       scaled_bilinear_scanline_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
 #define REPEAT_MIN_WIDTH    32
 
 static void
 fast_composite_tiled_repeat (pixman_implementation_t *imp,
 			     pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
     pixman_composite_func_t func;
@@ -1960,16 +2172,20 @@ static const pixman_fast_path_t c_fast_p
 	PIXMAN_any,
 	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
 	 FAST_PATH_NORMAL_REPEAT),
 	PIXMAN_any, 0,
 	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
 	fast_composite_tiled_repeat
     },
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+
     {   PIXMAN_OP_NONE	},
 };
 
 #ifdef WORDS_BIGENDIAN
 #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
 #else
 #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
 #endif
--- a/gfx/cairo/libpixman/src/pixman-inlines.h
+++ b/gfx/cairo/libpixman/src/pixman-inlines.h
@@ -80,16 +80,21 @@ repeat (pixman_repeat_t repeat, int *c, 
     }
     return TRUE;
 }
 
 #ifdef MOZ_GFX_OPTIMIZE_MOBILE
 #define LOW_QUALITY_INTERPOLATION
 #endif
 
+#ifdef LOW_QUALITY_INTERPOLATION
+#define INTERPOLATION_PRECISION_BITS 4
+#else
+#define INTERPOLATION_PRECISION_BITS 8
+#endif
 static force_inline int32_t
 interpolation_coord(pixman_fixed_t t)
 {
 #ifdef LOW_QUALITY_INTERPOLATION
     return (t >> 12) & 0xf;
 #else
     return (t >> 8) & 0xff;
 #endif
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/pixman-bilinear-fastpath.patch
@@ -0,0 +1,287 @@
+changeset:   94061:73a9b24d863a
+tag:         bilin
+tag:         qbase
+tag:         qtip
+tag:         tip
+user:        Jeff Muizelaar <jmuizelaar@mozilla.com>
+date:        Tue May 15 18:26:16 2012 -0400
+summary:     Bug 754364. Add bilinear non-repeat and repeat fast paths. r=joe
+
+diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c
+--- a/gfx/cairo/libpixman/src/pixman-fast-path.c
++++ b/gfx/cairo/libpixman/src/pixman-fast-path.c
+@@ -1186,16 +1186,228 @@ FAST_NEAREST (8888_565_none, 8888, 0565,
+ FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+ FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+ FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+ FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+ FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+ FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+ FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+ 
++static force_inline void
++scaled_bilinear_scanline_8888_565_OVER (uint16_t *       dst,
++                                        const uint32_t * mask,
++                                        const uint32_t * src_top,
++                                        const uint32_t * src_bottom,
++                                        int32_t          w,
++                                        int              wt,
++                                        int              wb,
++                                        pixman_fixed_t   vx,
++                                        pixman_fixed_t   unit_x,
++                                        pixman_fixed_t   max_vx,
++                                        pixman_bool_t    zero_src)
++{
++    while ((w -= 1) >= 0)
++    {
++	uint32_t tl = src_top [pixman_fixed_to_int (vx)];
++	uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];
++	uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];
++	uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
++	uint32_t src, result;
++	uint16_t d;
++	d = *dst;
++	src = bilinear_interpolation (tl, tr,
++				      bl, br,
++				      interpolation_coord(vx),
++				      wb >> (8 - INTERPOLATION_PRECISION_BITS));
++	vx += unit_x;
++	result = over (src, CONVERT_0565_TO_0888 (d));
++	*dst++ = CONVERT_8888_TO_0565(result);
++    }
++}
++
++static force_inline void
++scaled_bilinear_scanline_8888_8888_OVER (uint32_t *       dst,
++                                         const uint32_t * mask,
++                                         const uint32_t * src_top,
++                                         const uint32_t * src_bottom,
++                                         int32_t          w,
++                                         int              wt,
++                                         int              wb,
++                                         pixman_fixed_t   vx,
++                                         pixman_fixed_t   unit_x,
++                                         pixman_fixed_t   max_vx,
++                                         pixman_bool_t    zero_src)
++{
++    while ((w -= 1) >= 0)
++    {
++	uint32_t tl = src_top [pixman_fixed_to_int (vx)];
++	uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];
++	uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];
++	uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
++	uint32_t src;
++	uint32_t d;
++	uint32_t result;
++	d = *dst;
++	src = bilinear_interpolation (tl, tr,
++				      bl, br,
++				      interpolation_coord(vx),
++				      wb >> (8 - INTERPOLATION_PRECISION_BITS));
++	vx += unit_x;
++	*dst++ = over (src, d);
++    }
++}
++
++#if 1
++
++static force_inline void
++scaled_bilinear_scanline_565_565_SRC (uint16_t *       dst,
++				      const uint32_t * mask,
++				      const uint16_t * src_top,
++				      const uint16_t * src_bottom,
++				      int32_t          w,
++				      int              wt,
++				      int              wb,
++				      pixman_fixed_t   vx,
++				      pixman_fixed_t   unit_x,
++				      pixman_fixed_t   max_vx,
++				      pixman_bool_t    zero_src)
++{
++    while ((w -= 1) >= 0)
++    {
++	uint16_t tl = src_top [pixman_fixed_to_int (vx)];
++	uint16_t tr = src_top [pixman_fixed_to_int (vx) + 1];
++	uint16_t bl = src_bottom [pixman_fixed_to_int (vx)];
++	uint16_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
++	uint32_t d;
++	d = bilinear_interpolation(CONVERT_0565_TO_8888(tl),
++				   CONVERT_0565_TO_8888(tr),
++				   CONVERT_0565_TO_8888(bl),
++				   CONVERT_0565_TO_8888(br),
++				   interpolation_coord(vx),
++				   wb >> (8 - INTERPOLATION_PRECISION_BITS));
++	vx += unit_x;
++	*dst++ = CONVERT_8888_TO_0565(d);
++    }
++}
++
++#else
++
++#define SK_G16_MASK_IN_PLACE 0xfc0
++
++static inline uint32_t SkExpand_rgb_16(uint16_t c) {
++
++    return ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE);
++}
++
++/** Compress an expanded value (from SkExpand_rgb_16) back down to a 16bit
++    color value. The computation yields only 16bits of valid data, but we claim
++    to return 32bits, so that the compiler won't generate extra instructions to
++    "clean" the top 16bits. However, the top 16 can contain garbage, so it is
++    up to the caller to safely ignore them.
++*/
++static inline uint16_t SkCompact_rgb_16(uint32_t c) {
++    return ((c >> 16) & SK_G16_MASK_IN_PLACE) | (c & ~SK_G16_MASK_IN_PLACE);
++}
++// returns expanded * 5bits
++static inline uint32_t Filter_565_Expanded(unsigned x, unsigned y,
++                                           uint32_t a00, uint32_t a01,
++                                           uint32_t a10, uint32_t a11) {
++    a00 = SkExpand_rgb_16(a00);
++    a01 = SkExpand_rgb_16(a01);
++    a10 = SkExpand_rgb_16(a10);
++    a11 = SkExpand_rgb_16(a11);
++    
++    int xy = x * y >> 3;
++    return  a00 * (32 - 2*y - 2*x + xy) +
++            a01 * (2*x - xy) +
++            a10 * (2*y - xy) +
++            a11 * xy;
++}
++
++
++
++static force_inline void
++scaled_bilinear_scanline_565_565_SRC (uint16_t *       dst,
++				      const uint32_t * mask,
++				      const uint16_t * src_top,
++				      const uint16_t * src_bottom,
++				      int32_t          w,
++				      int              wt,
++				      int              wb,
++				      pixman_fixed_t   vx,
++				      pixman_fixed_t   unit_x,
++				      pixman_fixed_t   max_vx,
++				      pixman_bool_t    zero_src)
++{
++    while ((w -= 1) >= 0)
++    {
++	uint16_t tl = src_top [pixman_fixed_to_int (vx)];
++	uint16_t tr = src_top [pixman_fixed_to_int (vx) + 1];
++	uint16_t bl = src_bottom [pixman_fixed_to_int (vx)];
++	uint16_t br = src_bottom [pixman_fixed_to_int (vx) + 1];
++
++        uint32_t tmp = Filter_565_Expanded((vx>>12)&0xf, wb>>4, tl, tr, bl, br);
++        vx += unit_x;
++        *dst++ = SkCompact_rgb_16((tmp) >> 5);
++    }
++}
++
++
++#endif
++FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC,
++			       scaled_bilinear_scanline_565_565_SRC,
++			       uint16_t, uint32_t, uint16_t,
++			       COVER, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC,
++			       scaled_bilinear_scanline_565_565_SRC,
++			       uint16_t, uint32_t, uint16_t,
++			       PAD, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC,
++			       scaled_bilinear_scanline_565_565_SRC,
++			       uint16_t, uint32_t, uint16_t,
++			       NONE, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC,
++			       scaled_bilinear_scanline_565_565_SRC,
++			       uint16_t, uint32_t, uint16_t,
++			       NORMAL, FLAG_NONE)
++
++FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER,
++			       scaled_bilinear_scanline_8888_565_OVER,
++			       uint32_t, uint32_t, uint16_t,
++			       COVER, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER,
++			       scaled_bilinear_scanline_8888_565_OVER,
++			       uint32_t, uint32_t, uint16_t,
++			       PAD, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER,
++			       scaled_bilinear_scanline_8888_565_OVER,
++			       uint32_t, uint32_t, uint16_t,
++			       NONE, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER,
++			       scaled_bilinear_scanline_8888_565_OVER,
++			       uint32_t, uint32_t, uint16_t,
++			       NORMAL, FLAG_NONE)
++
++FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER,
++			       scaled_bilinear_scanline_8888_8888_OVER,
++			       uint32_t, uint32_t, uint32_t,
++			       COVER, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER,
++			       scaled_bilinear_scanline_8888_8888_OVER,
++			       uint32_t, uint32_t, uint32_t,
++			       PAD, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER,
++			       scaled_bilinear_scanline_8888_8888_OVER,
++			       uint32_t, uint32_t, uint32_t,
++			       NONE, FLAG_NONE)
++FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER,
++			       scaled_bilinear_scanline_8888_8888_OVER,
++			       uint32_t, uint32_t, uint32_t,
++			       NORMAL, FLAG_NONE)
++
+ #define REPEAT_MIN_WIDTH    32
+ 
+ static void
+ fast_composite_tiled_repeat (pixman_implementation_t *imp,
+ 			     pixman_composite_info_t *info)
+ {
+     PIXMAN_COMPOSITE_ARGS (info);
+     pixman_composite_func_t func;
+@@ -1960,16 +2172,20 @@ static const pixman_fast_path_t c_fast_p
+ 	PIXMAN_any,
+ 	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
+ 	 FAST_PATH_NORMAL_REPEAT),
+ 	PIXMAN_any, 0,
+ 	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
+ 	fast_composite_tiled_repeat
+     },
+ 
++    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
++    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
++    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
++
+     {   PIXMAN_OP_NONE	},
+ };
+ 
+ #ifdef WORDS_BIGENDIAN
+ #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
+ #else
+ #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
+ #endif
+diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h
+--- a/gfx/cairo/libpixman/src/pixman-inlines.h
++++ b/gfx/cairo/libpixman/src/pixman-inlines.h
+@@ -80,16 +80,21 @@ repeat (pixman_repeat_t repeat, int *c, 
+     }
+     return TRUE;
+ }
+ 
+ #ifdef MOZ_GFX_OPTIMIZE_MOBILE
+ #define LOW_QUALITY_INTERPOLATION
+ #endif
+ 
++#ifdef LOW_QUALITY_INTERPOLATION
++#define INTERPOLATION_PRECISION_BITS 4
++#else
++#define INTERPOLATION_PRECISION_BITS 8
++#endif
+ static force_inline int32_t
+ interpolation_coord(pixman_fixed_t t)
+ {
+ #ifdef LOW_QUALITY_INTERPOLATION
+     return (t >> 12) & 0xf;
+ #else
+     return (t >> 8) & 0xff;
+ #endif
--- a/layout/reftests/transform-3d/reftest.list
+++ b/layout/reftests/transform-3d/reftest.list
@@ -3,17 +3,17 @@
 # Check that scaleZ(-1) rotateX(180deg) is the same as rotateY(180deg)
 == scalezrotatex-1.html scalezrotatex-1-ref.html
 # Check that the perspectve() transform function results in some visual changes
 != rotatex-perspective-1a.html rotatex-1-ref.html
 # Check that -moz-perspective results in visual changes to child transformed elements
 != rotatex-perspective-1b.html rotatex-1-ref.html
 # -moz-perspective should only apply to child elements
 == rotatex-perspective-1c.html rotatex-1-ref.html
-== rotatex-perspective-3a.html rotatex-perspective-3-ref.html
+fails-if(Android) == rotatex-perspective-3a.html rotatex-perspective-3-ref.html # bug 755543
 == scalez-1a.html scalez-1-ref.html
 fails-if(cocoaWidget) random-if(/^Windows\x20NT\x206\.1/.test(http.oscpu)) == preserve3d-1a.html preserve3d-1-ref.html
 == preserve3d-1b.html about:blank
 == preserve3d-clipped.html about:blank 
 == preserve3d-2a.html preserve3d-2-ref.html
 == preserve3d-2b.html preserve3d-2-ref.html
 == preserve3d-2c.html preserve3d-2-ref.html
 == preserve3d-2d.html preserve3d-2-ref.html