Bug 754364. pixman: Add scaled nearest repeat fast paths. r=jrmuizel
author Siarhei Siamashka <siarhei.siamashka@nokia.com>
Sat, 12 May 2012 10:01:17 -0400
changeset 93845 d61928d439b4e9335f3bd1464ed9bf3a219129b8
parent 93839 c5023518db2f092fc7320f530ee4c33c9778e751
child 93846 c216e50bdc0d7420b678bbf71dabd2e6baa87e82
push id 22681
push user wmccloskey@mozilla.com
push date Mon, 14 May 2012 00:26:22 +0000
treeherder mozilla-central@ac968ff4fe41 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 754364
milestone 15.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 754364. pixman: Add scaled nearest repeat fast paths. r=jrmuizel This will allow us to scale and repeat in a single pass instead of doing it in two.
gfx/cairo/libpixman/src/pixman-arm-common.h
gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
gfx/cairo/libpixman/src/pixman-fast-path.c
gfx/cairo/libpixman/src/pixman-inlines.h
gfx/cairo/libpixman/src/pixman-sse2.c
--- a/gfx/cairo/libpixman/src/pixman-arm-common.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-common.h
@@ -231,90 +231,102 @@ cputype##_composite_##name (pixman_imple
 #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
                                                src_type, dst_type)            \
 void                                                                          \
 pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    int32_t          w,        \
                                                    dst_type *       dst,      \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x);  \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx);  \
                                                                               \
 static force_inline void                                                      \
 scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
                                                    const src_type * ps,       \
                                                    int32_t          w,        \
                                                    pixman_fixed_t   vx,       \
                                                    pixman_fixed_t   unit_x,   \
                                                    pixman_fixed_t   max_vx,   \
                                                    pixman_bool_t    zero_src) \
 {                                                                             \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x);\
+                                                                  vx, unit_x, \
+                                                                  max_vx);    \
 }                                                                             \
                                                                               \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
                        src_type, dst_type, COVER)                             \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
                        src_type, dst_type, NONE)                              \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, PAD)
+                       src_type, dst_type, PAD)                               \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NORMAL)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
     SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
 #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
                                                   src_type, dst_type)         \
 void                                                                          \
 pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    int32_t          w,        \
                                                    dst_type *       dst,      \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
                                                    pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
                                                    const uint8_t *  mask);    \
                                                                               \
 static force_inline void                                                      \
 scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
                                                    dst_type *       pd,       \
                                                    const src_type * ps,       \
                                                    int32_t          w,        \
                                                    pixman_fixed_t   vx,       \
                                                    pixman_fixed_t   unit_x,   \
                                                    pixman_fixed_t   max_vx,   \
                                                    pixman_bool_t    zero_src) \
 {                                                                             \
     if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
 	return;                                                               \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
                                                                   vx, unit_x, \
+                                                                  max_vx,     \
                                                                   mask);      \
 }                                                                             \
                                                                               \
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
                               src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
                               src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
 
 /*****************************************************************************/
 
 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
                                                 src_type, dst_type)           \
 void                                                                          \
 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
                                                 dst_type *       dst,         \
--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
@@ -207,47 +207,59 @@
 
 /*
  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
  * aliases to be defined)
  */
 .macro pixld1_s elem_size, reg1, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[0]}, [TMP1, :16]
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[1]}, [TMP2, :16]
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[2]}, [TMP1, :16]
     vld1.16 {d&reg1&[3]}, [TMP2, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #2
     vld1.32 {d&reg1&[0]}, [TMP1, :32]
     vld1.32 {d&reg1&[1]}, [TMP2, :32]
 .else
     .error "unsupported"
 .endif
 .endm
 
 .macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
+.if 0 /* elem_size == 32 */
     mov     TMP1, VX, asr #16
     add     VX, VX, UNIT_X, asl #1
     add     TMP1, mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
     sub     VX, VX, UNIT_X
     add     TMP2, mem_operand, TMP2, asl #2
     vld1.32 {d&reg1&[0]}, [TMP1, :32]
     mov     TMP1, VX, asr #16
@@ -263,22 +275,26 @@
     pixld1_s elem_size, reg1, mem_operand
     pixld1_s elem_size, reg2, mem_operand
 .endif
 .endm
 
 .macro pixld0_s elem_size, reg1, idx, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
 .endif
 .endm
 
 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
 .if numbytes == 32
     pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
@@ -959,25 +975,27 @@ fname:
     DST_W       .req        r1
     SRC         .req        r2
     VX          .req        r3
     UNIT_X      .req        ip
     MASK        .req        lr
     TMP1        .req        r4
     TMP2        .req        r5
     DST_R       .req        r6
+    SRC_WIDTH_FIXED .req        r7
 
     .macro pixld_src x:vararg
         pixld_s x
     .endm
 
     ldr         UNIT_X, [sp]
-    push        {r4-r6, lr}
+    push        {r4-r8, lr}
+    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
     .if mask_bpp != 0
-    ldr         MASK, [sp, #(16 + 4)]
+    ldr         MASK, [sp, #(24 + 8)]
     .endif
 .else
     /*
      * Assign symbolic names to registers
      */
     W           .req        r0      /* width (is updated during processing) */
     DST_W       .req        r1      /* destination buffer pointer for writes */
     SRC         .req        r2      /* source buffer pointer */
@@ -1039,41 +1057,42 @@ 7:
     /* Process the remaining trailing pixels in the scanline (dst aligned) */
     process_trailing_pixels 0, 1, \
                             process_pixblock_head, \
                             process_pixblock_tail, \
                             process_pixblock_tail_head
 
     cleanup
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 .else
     bx          lr  /* exit */
 .endif
 8:
     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
     process_trailing_pixels 0, 0, \
                             process_pixblock_head, \
                             process_pixblock_tail, \
                             process_pixblock_tail_head
 
     cleanup
 
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 
     .unreq      DST_R
     .unreq      SRC
     .unreq      W
     .unreq      VX
     .unreq      UNIT_X
     .unreq      TMP1
     .unreq      TMP2
     .unreq      DST_W
     .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
 
 .else
     bx          lr  /* exit */
 
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
     .unreq      DST_W
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
@@ -350,59 +350,67 @@ 0:	add	sp, sp, #28
  *                              remaining before the end of scanline
  */
 
 .macro generate_nearest_scanline_func fname, bpp_shift, t,      \
                                       prefetch_distance,        \
                                       prefetch_braking_distance
 
 pixman_asm_function fname
-	W	.req	r0
-	DST	.req	r1
-	SRC	.req	r2
-	VX	.req	r3
-	UNIT_X	.req	ip
-	TMP1	.req	r4
-	TMP2	.req	r5
-	VXMASK	.req	r6
-	PF_OFFS	.req	r7
+	W		.req	r0
+	DST		.req	r1
+	SRC		.req	r2
+	VX		.req	r3
+	UNIT_X		.req	ip
+	TMP1		.req	r4
+	TMP2		.req	r5
+	VXMASK		.req	r6
+	PF_OFFS		.req	r7
+	SRC_WIDTH_FIXED	.req	r8
 
 	ldr	UNIT_X, [sp]
-	push	{r4, r5, r6, r7}
+	push	{r4, r5, r6, r7, r8, r10}
 	mvn	VXMASK, #((1 << bpp_shift) - 1)
+	ldr	SRC_WIDTH_FIXED, [sp, #28]
 
 	/* define helper macro */
 	.macro	scale_2_pixels
 		ldr&t	TMP1, [SRC, TMP1]
-		and	TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
-		add	VX, VX, UNIT_X
+		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
 		str&t	TMP1, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
 
 		ldr&t	TMP2, [SRC, TMP2]
-		and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-		add	VX, VX, UNIT_X
+		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
 		str&t	TMP2, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
 	.endm
 
 	/* now do the scaling */
-	and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-	add	VX, VX, UNIT_X
+	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+	adds	VX, VX, UNIT_X
+9:	subpls	VX, VX, SRC_WIDTH_FIXED
+	bpl	9b
 	subs	W, W, #(8 + prefetch_braking_distance)
 	blt	2f
 	/* calculate prefetch offset */
 	mov	PF_OFFS, #prefetch_distance
 	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
 1:	/* main loop, process 8 pixels per iteration with prefetch */
-	subs	W, W, #8
+	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
 	add	PF_OFFS, UNIT_X, lsl #3
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
-	pld	[SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+	subs	W, W, #8
 	bge	1b
 2:
 	subs	W, W, #(4 - 8 - prefetch_braking_distance)
 	blt	2f
 1:	/* process the remaining pixels */
 	scale_2_pixels
 	scale_2_pixels
 	subs	W, W, #4
@@ -421,18 +429,19 @@ 2:
 	.unreq	SRC
 	.unreq	W
 	.unreq	VX
 	.unreq	UNIT_X
 	.unreq	TMP1
 	.unreq	TMP2
 	.unreq	VXMASK
 	.unreq	PF_OFFS
+	.unreq  SRC_WIDTH_FIXED
 	/* return */
-	pop	{r4, r5, r6, r7}
+	pop	{r4, r5, r6, r7, r8, r10}
 	bx	lr
 .endfunc
 .endm
 
 generate_nearest_scanline_func \
     pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
 
 generate_nearest_scanline_func \
--- a/gfx/cairo/libpixman/src/pixman-fast-path.c
+++ b/gfx/cairo/libpixman/src/pixman-fast-path.c
@@ -1367,40 +1367,40 @@ scaled_nearest_scanline_565_565_SRC (uin
 				     pixman_fixed_t   vx,
 				     pixman_fixed_t   unit_x,
 				     pixman_fixed_t   max_vx,
 				     pixman_bool_t    fully_transparent_src)
 {
     uint16_t tmp1, tmp2, tmp3, tmp4;
     while ((w -= 4) >= 0)
     {
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
+	tmp3 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
+	tmp4 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
 	*dst++ = tmp1;
 	*dst++ = tmp2;
 	*dst++ = tmp3;
 	*dst++ = tmp4;
     }
     if (w & 2)
     {
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
 	*dst++ = tmp1;
 	*dst++ = tmp2;
     }
     if (w & 1)
-	*dst++ = src[pixman_fixed_to_int (vx)];
+	*dst = *(src + pixman_fixed_to_int (vx));
 }
 
 FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
 		       scaled_nearest_scanline_565_565_SRC,
 		       uint16_t, uint16_t, COVER)
 FAST_NEAREST_MAINLOOP (565_565_none_SRC,
 		       scaled_nearest_scanline_565_565_SRC,
 		       uint16_t, uint16_t, NONE)
--- a/gfx/cairo/libpixman/src/pixman-inlines.h
+++ b/gfx/cairo/libpixman/src/pixman-inlines.h
@@ -296,51 +296,51 @@ pad_repeat_get_scanline_bounds (int32_t 
 #define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
 			      src_type_t, dst_type_t, OP, repeat_mode)				\
 static force_inline void									\
 scanline_func_name (dst_type_t       *dst,							\
 		    const src_type_t *src,							\
 		    int32_t           w,							\
 		    pixman_fixed_t    vx,							\
 		    pixman_fixed_t    unit_x,							\
-		    pixman_fixed_t    max_vx,							\
+		    pixman_fixed_t    src_width_fixed,						\
 		    pixman_bool_t     fully_transparent_src)					\
 {												\
 	uint32_t   d;										\
 	src_type_t s1, s2;									\
 	uint8_t    a1, a2;									\
 	int        x1, x2;									\
 												\
 	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
 	    return;										\
 												\
 	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
 	    abort();										\
 												\
 	while ((w -= 2) >= 0)									\
 	{											\
-	    x1 = vx >> 16;									\
+	    x1 = pixman_fixed_to_int (vx);							\
 	    vx += unit_x;									\
 	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    {											\
 		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
 	    }											\
-	    s1 = src[x1];									\
+	    s1 = *(src + x1);									\
 												\
-	    x2 = vx >> 16;									\
+	    x2 = pixman_fixed_to_int (vx);							\
 	    vx += unit_x;									\
 	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    {											\
 		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
 	    }											\
-	    s2 = src[x2];									\
+	    s2 = *(src + x2);									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
 	    {											\
 		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
 		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
 												\
 		if (a1 == 0xff)									\
 		{										\
@@ -374,18 +374,18 @@ scanline_func_name (dst_type_t       *ds
 	    {											\
 		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
 		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
 	    }											\
 	}											\
 												\
 	if (w & 1)										\
 	{											\
-	    x1 = vx >> 16;									\
-	    s1 = src[x1];									\
+	    x1 = pixman_fixed_to_int (vx);							\
+	    s1 = *(src + x1);									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
 	    {											\
 		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
 												\
 		if (a1 == 0xff)									\
 		{										\
 		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
@@ -413,17 +413,17 @@ static void											\
 fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
 						   pixman_composite_info_t *info)               \
 {												\
     PIXMAN_COMPOSITE_ARGS (info);					                        \
     dst_type_t *dst_line;						                        \
     mask_type_t *mask_line;									\
     src_type_t *src_first_line;									\
     int       y;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
     pixman_fixed_t max_vy;									\
     pixman_vector_t v;										\
     pixman_fixed_t vx, vy;									\
     pixman_fixed_t unit_x, unit_y;								\
     int32_t left_pad, right_pad;								\
 												\
     src_type_t *src;										\
     dst_type_t *dst;										\
@@ -459,21 +459,20 @@ fast_composite_scaled_nearest  ## scale_
     v.vector[0] -= pixman_fixed_e;								\
     v.vector[1] -= pixman_fixed_e;								\
 												\
     vx = v.vector[0];										\
     vy = v.vector[1];										\
 												\
     if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
     {												\
+	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
+												\
 	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
-												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
 	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
     }												\
 												\
     if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
 	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
     {												\
 	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
 					&width, &left_pad, &right_pad);				\
@@ -485,68 +484,76 @@ fast_composite_scaled_nearest  ## scale_
 	dst = dst_line;										\
 	dst_line += dst_stride;									\
 	if (have_mask && !mask_is_solid)							\
 	{											\
 	    mask = mask_line;									\
 	    mask_line += mask_stride;								\
 	}											\
 												\
-	y = vy >> 16;										\
+	y = pixman_fixed_to_int (vy);								\
 	vy += unit_y;										\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
 	{											\
 	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
 	    src = src_first_line + src_stride * y;						\
 	    if (left_pad > 0)									\
 	    {											\
-		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
+		scanline_func (mask, dst,							\
+			       src + src_image->bits.width - src_image->bits.width + 1,		\
+			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
 	    }											\
 	    if (width > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
 	    }											\
 	    if (right_pad > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, src + src_image->bits.width - 1,		\
-			       right_pad, 0, 0, 0, FALSE);					\
+			       dst + left_pad + width, src + src_image->bits.width,		\
+			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
 	    }											\
 	}											\
 	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
 	{											\
 	    static const src_type_t zero[1] = { 0 };						\
 	    if (y < 0 || y >= src_image->bits.height)						\
 	    {											\
-		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
+		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 		continue;									\
 	    }											\
 	    src = src_first_line + src_stride * y;						\
 	    if (left_pad > 0)									\
 	    {											\
-		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
+		scanline_func (mask, dst, zero + 1, left_pad,					\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 	    }											\
 	    if (width > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
 	    }											\
 	    if (right_pad > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
+			       dst + left_pad + width, zero + 1, right_pad,			\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 	    }											\
 	}											\
 	else											\
 	{											\
 	    src = src_first_line + src_stride * y;						\
-	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
+	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\
+			   unit_x, src_width_fixed, FALSE);					\
 	}											\
     }												\
 }
 
 /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
 #define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
 				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
 	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
--- a/gfx/cairo/libpixman/src/pixman-sse2.c
+++ b/gfx/cairo/libpixman/src/pixman-sse2.c
@@ -5073,55 +5073,65 @@ sse2_composite_over_8888_8888_8888 (pixm
 
 /* A variant of 'sse2_combine_over_u' with minor tweaks */
 static force_inline void
 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                              const uint32_t* ps,
                                              int32_t         w,
                                              pixman_fixed_t  vx,
                                              pixman_fixed_t  unit_x,
-                                             pixman_fixed_t  max_vx,
+                                             pixman_fixed_t  src_width_fixed,
                                              pixman_bool_t   fully_transparent_src)
 {
     uint32_t s, d;
     const uint32_t* pm = NULL;
 
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
     if (fully_transparent_src)
 	return;
 
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
 	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
 	if (pm)
 	    pm++;
 	w--;
     }
 
     while (w >= 4)
     {
 	__m128i tmp;
 	uint32_t tmp1, tmp2, tmp3, tmp4;
 
-	tmp1 = ps[vx >> 16];
+	tmp1 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
 
 	if (is_opaque (xmm_src_hi))
 	{
 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
@@ -5149,18 +5159,20 @@ scaled_nearest_scanline_sse2_8888_8888_O
 	pd += 4;
 	if (pm)
 	    pm += 4;
     }
 
     while (w)
     {
 	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
 	if (pm)
 	    pm++;
 
 	w--;
     }
 }
@@ -5169,41 +5181,46 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_co
 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
 		       uint32_t, uint32_t, COVER)
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
 		       uint32_t, uint32_t, NONE)
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
 		       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NORMAL)
 
 static force_inline void
 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 					       uint32_t *       dst,
 					       const uint32_t * src,
 					       int32_t          w,
 					       pixman_fixed_t   vx,
 					       pixman_fixed_t   unit_x,
-					       pixman_fixed_t   max_vx,
+					       pixman_fixed_t   src_width_fixed,
 					       pixman_bool_t    zero_src)
 {
     __m128i xmm_mask;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
     if (zero_src || (*mask >> 24) == 0)
 	return;
 
     xmm_mask = create_mask_16_128 (*mask >> 24);
 
     while (w && (unsigned long)dst & 15)
     {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	if (s)
 	{
 	    uint32_t d = *dst;
 
 	    __m128i ms = unpack_32_1x128 (s);
 	    __m128i alpha     = expand_alpha_1x128 (ms);
 	    __m128i dest      = xmm_mask;
@@ -5215,24 +5232,32 @@ scaled_nearest_scanline_sse2_8888_n_8888
 	dst++;
 	w--;
     }
 
     while (w >= 4)
     {
 	uint32_t tmp1, tmp2, tmp3, tmp4;
 
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
 	if (!is_zero (xmm_src))
 	{
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
@@ -5250,18 +5275,20 @@ scaled_nearest_scanline_sse2_8888_n_8888
 	}
 
 	dst += 4;
 	w -= 4;
     }
 
     while (w)
     {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	if (s)
 	{
 	    uint32_t d = *dst;
 
 	    __m128i ms = unpack_32_1x128 (s);
 	    __m128i alpha = expand_alpha_1x128 (ms);
 	    __m128i mask  = xmm_mask;
@@ -5281,16 +5308,19 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
 #define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
     const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
     const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
 					  unit_x, unit_x, unit_x, unit_x);	\
@@ -5750,21 +5780,29 @@ static const pixman_fast_path_t sse2_fas
     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),