author | Siarhei Siamashka <siarhei.siamashka@nokia.com> |
Sat, 12 May 2012 10:01:17 -0400 | |
changeset 93845 | d61928d439b4e9335f3bd1464ed9bf3a219129b8 |
parent 93839 | c5023518db2f092fc7320f530ee4c33c9778e751 |
child 93846 | c216e50bdc0d7420b678bbf71dabd2e6baa87e82 |
push id | 22681 |
push user | wmccloskey@mozilla.com |
push date | Mon, 14 May 2012 00:26:22 +0000 |
treeherder | mozilla-central@ac968ff4fe41 [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | jrmuizel |
bugs | 754364 |
milestone | 15.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/gfx/cairo/libpixman/src/pixman-arm-common.h +++ b/gfx/cairo/libpixman/src/pixman-arm-common.h @@ -231,90 +231,102 @@ cputype##_composite_##name (pixman_imple #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op, \ src_type, dst_type) \ void \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ int32_t w, \ dst_type * dst, \ const src_type * src, \ pixman_fixed_t vx, \ - pixman_fixed_t unit_x); \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ const src_type * ps, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ pixman_fixed_t max_vx, \ pixman_bool_t zero_src) \ { \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ - vx, unit_x);\ + vx, unit_x, \ + max_vx); \ } \ \ FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op, \ src_type, dst_type, COVER) \ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op, \ src_type, dst_type, NONE) \ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op, \ - src_type, dst_type, PAD) + src_type, dst_type, PAD) \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op, \ + src_type, dst_type, NORMAL) /* Provide entries for the fast path table */ #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ src_type, dst_type) \ void \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ int32_t w, \ dst_type * dst, \ const src_type * src, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ const uint8_t * mask); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ dst_type * pd, \ const src_type * ps, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ pixman_fixed_t max_vx, \ pixman_bool_t zero_src) \ { \ if ((flags & SKIP_ZERO_SRC) && zero_src) \ return; \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ vx, unit_x, \ + max_vx, \ mask); \ } \ \ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op,\ src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op,\ src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op,\ - src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE) /* Provide entries for the fast path table */ #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) /*****************************************************************************/ #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op, \ src_type, dst_type) \ void \ pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ dst_type * dst, \
--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h @@ -207,47 +207,59 @@ /* * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register * aliases to be defined) */ .macro pixld1_s elem_size, reg1, mem_operand .if elem_size == 16 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #1 vld1.16 {d®1&[0]}, [TMP1, :16] mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 vld1.16 {d®1&[1]}, [TMP2, :16] mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #1 vld1.16 {d®1&[2]}, [TMP1, :16] vld1.16 {d®1&[3]}, [TMP2, :16] .elseif elem_size == 32 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #2 vld1.32 {d®1&[0]}, [TMP1, :32] vld1.32 {d®1&[1]}, [TMP2, :32] .else .error "unsupported" .endif .endm .macro pixld2_s elem_size, reg1, reg2, mem_operand -.if elem_size == 32 +.if 0 /* elem_size == 32 */ mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 add TMP1, mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 sub VX, VX, UNIT_X add TMP2, mem_operand, TMP2, asl #2 vld1.32 {d®1&[0]}, [TMP1, :32] mov TMP1, VX, asr #16 @@ -263,22 +275,26 @@ pixld1_s elem_size, reg1, mem_operand pixld1_s elem_size, reg2, mem_operand .endif .endm .macro pixld0_s elem_size, reg1, idx, mem_operand .if elem_size == 16 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 vld1.16 {d®1&[idx]}, [TMP1, :16] .elseif elem_size == 32 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #2 vld1.32 {d®1&[idx]}, [TMP1, :32] .endif .endm .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand .if numbytes == 32 pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand @@ -959,25 +975,27 @@ fname: DST_W .req r1 SRC .req r2 VX .req r3 UNIT_X .req ip MASK .req lr TMP1 .req r4 TMP2 .req r5 DST_R .req r6 + SRC_WIDTH_FIXED .req r7 .macro pixld_src x:vararg pixld_s x .endm ldr UNIT_X, [sp] - push {r4-r6, lr} + push {r4-r8, lr} + ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] .if mask_bpp != 0 - ldr MASK, [sp, #(16 + 4)] + ldr MASK, [sp, #(24 + 8)] .endif .else /* * Assign symbolic names to registers */ W .req r0 /* width (is updated during processing) */ DST_W .req r1 /* destination buffer pointer for writes */ SRC .req r2 /* source buffer pointer */ @@ -1039,41 +1057,42 @@ 7: /* Process the remaining trailing pixels in the scanline (dst aligned) */ process_trailing_pixels 0, 1, \ process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head cleanup .if use_nearest_scaling != 0 - pop {r4-r6, pc} /* exit */ + pop {r4-r8, pc} /* exit */ .else bx lr /* exit */ .endif 8: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head cleanup .if use_nearest_scaling != 0 - pop {r4-r6, pc} /* exit */ + pop {r4-r8, pc} /* exit */ .unreq DST_R .unreq SRC .unreq W .unreq VX .unreq UNIT_X .unreq TMP1 .unreq TMP2 .unreq DST_W .unreq MASK + .unreq SRC_WIDTH_FIXED .else bx lr /* exit */ .unreq SRC .unreq MASK .unreq DST_R .unreq DST_W
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S @@ -350,59 +350,67 @@ 0: add sp, sp, #28 * remaining before the end of scanline */ .macro generate_nearest_scanline_func fname, bpp_shift, t, \ prefetch_distance, \ prefetch_braking_distance pixman_asm_function fname - W .req r0 - DST .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - TMP1 .req r4 - TMP2 .req r5 - VXMASK .req r6 - PF_OFFS .req r7 + W .req r0 + DST .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + TMP1 .req r4 + TMP2 .req r5 + VXMASK .req r6 + PF_OFFS .req r7 + SRC_WIDTH_FIXED .req r8 ldr UNIT_X, [sp] - push {r4, r5, r6, r7} + push {r4, r5, r6, r7, r8, r10} mvn VXMASK, #((1 << bpp_shift) - 1) + ldr SRC_WIDTH_FIXED, [sp, #28] /* define helper macro */ .macro scale_2_pixels ldr&t TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP2, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X str&t TMP1, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b ldr&t TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X str&t TMP2, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b .endm /* now do the scaling */ - and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b subs W, W, #(8 + prefetch_braking_distance) blt 2f /* calculate prefetch offset */ mov PF_OFFS, #prefetch_distance mla PF_OFFS, UNIT_X, PF_OFFS, VX 1: /* main loop, process 8 pixels per iteration with prefetch */ - subs W, W, #8 + pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] add PF_OFFS, UNIT_X, lsl #3 scale_2_pixels scale_2_pixels scale_2_pixels scale_2_pixels - pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)] + subs W, W, #8 bge 1b 2: subs W, W, #(4 - 8 - prefetch_braking_distance) blt 2f 1: /* process the remaining pixels */ scale_2_pixels scale_2_pixels subs W, W, #4 @@ -421,18 +429,19 @@ 2: .unreq SRC .unreq W .unreq VX .unreq UNIT_X .unreq TMP1 .unreq TMP2 .unreq VXMASK .unreq PF_OFFS + .unreq SRC_WIDTH_FIXED /* return */ - pop {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r10} bx lr .endfunc .endm generate_nearest_scanline_func \ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 generate_nearest_scanline_func \
--- a/gfx/cairo/libpixman/src/pixman-fast-path.c +++ b/gfx/cairo/libpixman/src/pixman-fast-path.c @@ -1367,40 +1367,40 @@ scaled_nearest_scanline_565_565_SRC (uin pixman_fixed_t vx, pixman_fixed_t unit_x, pixman_fixed_t max_vx, pixman_bool_t fully_transparent_src) { uint16_t tmp1, tmp2, tmp3, tmp4; while ((w -= 4) >= 0) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; *dst++ = tmp3; *dst++ = tmp4; } if (w & 2) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; } if (w & 1) - *dst++ = src[pixman_fixed_to_int (vx)]; + *dst = *(src + pixman_fixed_to_int (vx)); } FAST_NEAREST_MAINLOOP (565_565_cover_SRC, scaled_nearest_scanline_565_565_SRC, uint16_t, uint16_t, COVER) FAST_NEAREST_MAINLOOP (565_565_none_SRC, scaled_nearest_scanline_565_565_SRC, uint16_t, uint16_t, NONE)
--- a/gfx/cairo/libpixman/src/pixman-inlines.h +++ b/gfx/cairo/libpixman/src/pixman-inlines.h @@ -296,51 +296,51 @@ pad_repeat_get_scanline_bounds (int32_t #define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ src_type_t, dst_type_t, OP, repeat_mode) \ static force_inline void \ scanline_func_name (dst_type_t *dst, \ const src_type_t *src, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx, \ + pixman_fixed_t src_width_fixed, \ pixman_bool_t fully_transparent_src) \ { \ uint32_t d; \ src_type_t s1, s2; \ uint8_t a1, a2; \ int x1, x2; \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \ return; \ \ if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ abort(); \ \ while ((w -= 2) >= 0) \ { \ - x1 = vx >> 16; \ + x1 = pixman_fixed_to_int (vx); \ vx += unit_x; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ + while (vx >= 0) \ + vx -= src_width_fixed; \ } \ - s1 = src[x1]; \ + s1 = *(src + x1); \ \ - x2 = vx >> 16; \ + x2 = pixman_fixed_to_int (vx); \ vx += unit_x; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ + while (vx >= 0) \ + vx -= src_width_fixed; \ } \ - s2 = src[x2]; \ + s2 = *(src + x2); \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ \ if (a1 == 0xff) \ { \ @@ -374,18 +374,18 @@ scanline_func_name (dst_type_t *ds { \ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ } \ } \ \ if (w & 1) \ { \ - x1 = vx >> 16; \ - s1 = src[x1]; \ + x1 = pixman_fixed_to_int (vx); \ + s1 = *(src + x1); \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ \ if (a1 == 0xff) \ { \ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ @@ -413,17 +413,17 @@ static void \ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ pixman_composite_info_t *info) \ { \ PIXMAN_COMPOSITE_ARGS (info); \ dst_type_t *dst_line; \ mask_type_t *mask_line; \ src_type_t *src_first_line; \ int y; \ - pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width); \ pixman_fixed_t max_vy; \ pixman_vector_t v; \ pixman_fixed_t vx, vy; \ pixman_fixed_t unit_x, unit_y; \ int32_t left_pad, right_pad; \ \ src_type_t *src; \ dst_type_t *dst; \ @@ -459,21 +459,20 @@ fast_composite_scaled_nearest ## scale_ v.vector[0] -= pixman_fixed_e; \ v.vector[1] -= pixman_fixed_e; \ \ vx = v.vector[0]; \ vy = v.vector[1]; \ \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ + max_vy = pixman_int_to_fixed (src_image->bits.height); \ + \ /* Clamp repeating positions inside the actual samples */ \ - max_vx = src_image->bits.width << 16; \ - max_vy = src_image->bits.height << 16; \ - \ - repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ } \ \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ { \ pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ &width, &left_pad, &right_pad); \ @@ -485,68 +484,76 @@ fast_composite_scaled_nearest ## scale_ dst = dst_line; \ dst_line += dst_stride; \ if (have_mask && !mask_is_solid) \ { \ mask = mask_line; \ mask_line += mask_stride; \ } \ \ - y = vy >> 16; \ + y = pixman_fixed_to_int (vy); \ vy += unit_y; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ { \ repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ src = src_first_line + src_stride * y; \ if (left_pad > 0) \ { \ - scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \ + scanline_func (mask, dst, \ + src + src_image->bits.width - src_image->bits.width + 1, \ + left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ } \ if (width > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ - dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ } \ if (right_pad > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ - dst + left_pad + width, src + src_image->bits.width - 1, \ - right_pad, 0, 0, 0, FALSE); \ + dst + left_pad + width, src + src_image->bits.width, \ + right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ } \ } \ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ { \ static const src_type_t zero[1] = { 0 }; \ if (y < 0 || y >= src_image->bits.height) \ { \ - scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \ + scanline_func (mask, dst, zero + 1, left_pad + width + right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ continue; \ } \ src = src_first_line + src_stride * y; \ if (left_pad > 0) \ { \ - scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \ + scanline_func (mask, dst, zero + 1, left_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ } \ if (width > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ - dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ } \ if (right_pad > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ - dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \ + dst + left_pad + width, zero + 1, right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ } \ } \ else \ { \ src = src_first_line + src_stride * y; \ - scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \ + scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \ + unit_x, src_width_fixed, FALSE); \ } \ } \ } /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ #define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ dst_type_t, repeat_mode, have_mask, mask_is_solid) \ FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \
--- a/gfx/cairo/libpixman/src/pixman-sse2.c +++ b/gfx/cairo/libpixman/src/pixman-sse2.c @@ -5073,55 +5073,65 @@ sse2_composite_over_8888_8888_8888 (pixm /* A variant of 'sse2_combine_over_u' with minor tweaks */ static force_inline void scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, const uint32_t* ps, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t fully_transparent_src) { uint32_t s, d; const uint32_t* pm = NULL; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; if (fully_transparent_src) return; /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) pm++; w--; } while (w >= 4) { __m128i tmp; uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = ps[vx >> 16]; + tmp1 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); if (is_opaque (xmm_src_hi)) { save_128_aligned ((__m128i*)pd, xmm_src_hi); @@ -5149,18 +5159,20 @@ scaled_nearest_scanline_sse2_8888_8888_O pd += 4; if (pm) pm += 4; } while (w) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) pm++; w--; } } @@ -5169,41 +5181,46 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_co scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, COVER) FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, NONE) FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, NORMAL) static force_inline void scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, uint32_t * dst, const uint32_t * src, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t zero_src) { __m128i xmm_mask; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; if (zero_src || (*mask >> 24) == 0) return; xmm_mask = create_mask_16_128 (*mask >> 24); while (w && (unsigned long)dst & 15) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { uint32_t d = *dst; __m128i ms = unpack_32_1x128 (s); __m128i alpha = expand_alpha_1x128 (ms); __m128i dest = xmm_mask; @@ -5215,24 +5232,32 @@ scaled_nearest_scanline_sse2_8888_n_8888 dst++; w--; } while (w >= 4) { uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); if (!is_zero (xmm_src)) { xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); @@ -5250,18 +5275,20 @@ scaled_nearest_scanline_sse2_8888_n_8888 } dst += 4; w -= 4; } while (w) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { uint32_t d = *dst; __m128i ms = unpack_32_1x128 (s); __m128i alpha = expand_alpha_1x128 (ms); __m128i mask = xmm_mask; @@ -5281,16 +5308,19 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_ scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) #define BILINEAR_DECLARE_VARIABLES \ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ unit_x, unit_x, unit_x, unit_x); \ @@ -5750,21 +5780,29 @@ static const pixman_fast_path_t sse2_fas SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),