author | Jeff Muizelaar <jmuizelaar@mozilla.com>
date | Mon, 18 Feb 2013 10:17:08 -0500
changeset 122243 | 46897ceac25352c7c53c979703d3d664010aeb49
parent 122242 | eaf5f56832b4e02edd345046a2d044fd19dfb317
child 122244 | 5dec45f30b77f60ac6415b26c6aaa57b0e14f547
push id | 24327
push user | gszorc@mozilla.com
push date | Tue, 19 Feb 2013 05:22:32 +0000
bugs | 809821
milestone | 21.0a1
--- a/gfx/cairo/libpixman/src/Makefile.in
+++ b/gfx/cairo/libpixman/src/Makefile.in
@@ -88,16 +88,17 @@ CSRCS = \
         pixman.c \
         pixman-combine16.c \
         pixman-combine32.c \
         pixman-combine-float.c \
         pixman-conical-gradient.c \
         pixman-edge.c \
         pixman-edge-accessors.c \
         pixman-fast-path.c \
+        pixman-filter.c \
         pixman-general.c \
         pixman-gradient-walker.c \
         pixman-glyph.c \
         pixman-image.c \
         pixman-implementation.c \
         pixman-linear-gradient.c \
         pixman-matrix.c \
         pixman-mips.c \
@@ -124,17 +125,17 @@ endif
 
 ifdef USE_VMX
 CSRCS += pixman-vmx.c
 DEFINES += -DUSE_VMX
 endif
 
 ifdef USE_ARM_SIMD_GCC
 CSRCS += pixman-arm-simd.c
-SSRCS += pixman-arm-simd-asm.S
+SSRCS += pixman-arm-simd-asm.S pixman-arm-simd-asm-scaled.S
 DEFINES += -DUSE_ARM_SIMD
 endif
 
 ifdef USE_ARM_NEON_GCC
 CSRCS += pixman-arm-neon.c
 SSRCS += pixman-arm-neon-asm.S
 SSRCS += pixman-arm-neon-asm-bilinear.S
 DEFINES += -DUSE_ARM_NEON
--- a/gfx/cairo/libpixman/src/pixman-access.c
+++ b/gfx/cairo/libpixman/src/pixman-access.c
@@ -1340,18 +1340,20 @@ static const format_info_t accessors[] =
     FORMAT_INFO (b8g8r8a8),
     FORMAT_INFO (b8g8r8x8),
     FORMAT_INFO (r8g8b8a8),
     FORMAT_INFO (r8g8b8x8),
     FORMAT_INFO (x14r6g6b6),
 /* sRGB formats */
     { PIXMAN_a8r8g8b8_sRGB,
+      NULL,
       fetch_scanline_a8r8g8b8_32_sRGB,
       fetch_scanline_a8r8g8b8_sRGB_float,
       fetch_pixel_a8r8g8b8_32_sRGB,
       fetch_pixel_a8r8g8b8_sRGB_float,
+      NULL,
       store_scanline_a8r8g8b8_32_sRGB,
       store_scanline_a8r8g8b8_sRGB_float,
     },
 /* 24bpp formats */
     FORMAT_INFO (r8g8b8),
     FORMAT_INFO (b8g8r8),
 /* 16bpp formats */
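The two NULL entries added above fill slots in the format_info_t initializer that the sRGB format leaves unimplemented, keeping the a8r8g8b8_sRGB row aligned with the widened table layout. The *_sRGB accessors named here have to undo sRGB gamma before pixels can be blended in linear light; as a point of reference, a minimal sketch of that per-channel decode (the standard sRGB EOTF, not pixman's exact code, which may use a lookup table instead):

    #include <math.h>
    #include <stdint.h>

    /* Decode one 8-bit sRGB channel to linear light, as a
     * fetch_..._sRGB_float-style accessor must do before compositing.
     * Illustrative only. */
    static float
    srgb_to_linear (uint8_t c8)
    {
        float c = c8 / 255.0f;
        if (c <= 0.04045f)              /* linear segment near black */
            return c / 12.92f;
        return powf ((c + 0.055f) / 1.055f, 2.4f);
    }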
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
@@ -0,0 +1,165 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .arch armv6
+    .object_arch armv4
+    .arm
+    .altmacro
+    .p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ * be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ *  fname - name of the function to generate
+ *  bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ *  t - type suffix for LDR/STR instructions
+ *  prefetch_distance - prefetch in the source image by that many
+ *                      pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels are
+ *                              remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+                                      prefetch_distance, \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
+    W               .req r0
+    DST             .req r1
+    SRC             .req r2
+    VX              .req r3
+    UNIT_X          .req ip
+    TMP1            .req r4
+    TMP2            .req r5
+    VXMASK          .req r6
+    PF_OFFS         .req r7
+    SRC_WIDTH_FIXED .req r8
+
+    ldr UNIT_X, [sp]
+    push {r4, r5, r6, r7, r8, r10}
+    mvn VXMASK, #((1 << bpp_shift) - 1)
+    ldr SRC_WIDTH_FIXED, [sp, #28]
+
+    /* define helper macro */
+    .macro scale_2_pixels
+        ldr&t TMP1, [SRC, TMP1]
+        and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+        adds VX, VX, UNIT_X
+        str&t TMP1, [DST], #(1 << bpp_shift)
+9:      subpls VX, VX, SRC_WIDTH_FIXED
+        bpl 9b
+
+        ldr&t TMP2, [SRC, TMP2]
+        and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+        adds VX, VX, UNIT_X
+        str&t TMP2, [DST], #(1 << bpp_shift)
+9:      subpls VX, VX, SRC_WIDTH_FIXED
+        bpl 9b
+    .endm
+
+    /* now do the scaling */
+    and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+    adds VX, VX, UNIT_X
+9:  subpls VX, VX, SRC_WIDTH_FIXED
+    bpl 9b
+    subs W, W, #(8 + prefetch_braking_distance)
+    blt 2f
+    /* calculate prefetch offset */
+    mov PF_OFFS, #prefetch_distance
+    mla PF_OFFS, UNIT_X, PF_OFFS, VX
+1:  /* main loop, process 8 pixels per iteration with prefetch */
+    pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
+    add PF_OFFS, UNIT_X, lsl #3
+    scale_2_pixels
+    scale_2_pixels
+    scale_2_pixels
+    scale_2_pixels
+    subs W, W, #8
+    bge 1b
+2:
+    subs W, W, #(4 - 8 - prefetch_braking_distance)
+    blt 2f
+1:  /* process the remaining pixels */
+    scale_2_pixels
+    scale_2_pixels
+    subs W, W, #4
+    bge 1b
+2:
+    tst W, #2
+    beq 2f
+    scale_2_pixels
+2:
+    tst W, #1
+    ldrne&t TMP1, [SRC, TMP1]
+    strne&t TMP1, [DST]
+    /* cleanup helper macro */
+    .purgem scale_2_pixels
+    .unreq DST
+    .unreq SRC
+    .unreq W
+    .unreq VX
+    .unreq UNIT_X
+    .unreq TMP1
+    .unreq TMP2
+    .unreq VXMASK
+    .unreq PF_OFFS
+    .unreq SRC_WIDTH_FIXED
+    /* return */
+    pop {r4, r5, r6, r7, r8, r10}
+    bx lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
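The template above generates nearest-neighbour scanline scalers that step through the source in 16.16 fixed point: VX is the current source x coordinate, UNIT_X the increment per destination pixel, and the subpls/bpl pairs wrap VX by SRC_WIDTH_FIXED for repeat handling. A rough C equivalent of one generated function (a sketch only: the repeat wrapping is simplified to a plain subtract-until-in-range loop, and the PLD prefetching is omitted; for the 0565 variant pixel_t would be uint16_t):

    #include <stdint.h>

    typedef uint32_t pixel_t;   /* 8888 variant; uint16_t for 0565 */

    static void
    scaled_nearest_scanline_SRC (int32_t w, pixel_t *dst, const pixel_t *src,
                                 int32_t vx, int32_t unit_x,
                                 int32_t src_width_fixed)
    {
        while (w-- > 0)
        {
            /* 'and ..., VXMASK, VX, asr #(16 - bpp_shift)' computes this
             * pixel-aligned byte offset in a single instruction */
            *dst++ = src[vx >> 16];
            vx += unit_x;
            while (vx >= src_width_fixed)   /* 9: subpls / bpl 9b */
                vx -= src_width_fixed;
        }
    }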
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S @@ -1,448 +1,613 @@ /* - * Copyright © 2008 Mozilla Corporation - * Copyright © 2010 Nokia Corporation + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in + * documentation, and that the name of the copyright holders not be used in * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no + * specific, written prior permission. The copyright holders make no * representations about the suitability of this software for any purpose. It * is provided "as is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. * - * Author: Jeff Muizelaar (jeff@infidigm.net) + * Author: Ben Avison (bavison@riscosopen.org) * */ /* Prevent the stack from becoming executable */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif .text .arch armv6 .object_arch armv4 .arm .altmacro .p2align 2 -/* Supplementary macro for setting function attributes */ -.macro pixman_asm_function fname - .func fname - .global fname -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif -fname: -.endm +#include "pixman-arm-simd-asm.h" -/* - * The code below was generated by gcc 4.3.4 from the commented out - * functions in 'pixman-arm-simd.c' file with the following optimization - * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" - * - * TODO: replace gcc generated code with hand tuned versions because - * the code quality is not very good, introduce symbolic register - * aliases for better readability and maintainability. +/* A head macro should do all processing which results in an output of up to + * 16 bytes, as far as the final load instruction. The corresponding tail macro + * should complete the processing of the up-to-16 bytes. The calling macro will + * sometimes choose to insert a preload or a decrement of X between them. 
+ * cond ARM condition code for code block + * numbytes Number of output bytes that should be generated this time + * firstreg First WK register in which to place output + * unaligned_src Whether to use non-wordaligned loads of source image + * unaligned_mask Whether to use non-wordaligned loads of mask image + * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output */ -pixman_asm_function pixman_composite_add_8_8_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - mov r10, r1 - sub sp, sp, #4 - subs r10, r10, #1 - mov r11, r0 - mov r8, r2 - str r3, [sp] - ldr r7, [sp, #36] - bcc 0f -6: cmp r11, #0 - beq 1f - orr r3, r8, r7 - tst r3, #3 - beq 2f - mov r1, r8 - mov r0, r7 - mov r12, r11 - b 3f -5: tst r3, #3 - beq 4f -3: ldrb r2, [r0], #1 - subs r12, r12, #1 - ldrb r3, [r1] - uqadd8 r3, r2, r3 - strb r3, [r1], #1 - orr r3, r1, r0 - bne 5b -1: ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 -10: subs r10, r10, #1 - bcs 6b -0: add sp, sp, #4 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -2: mov r12, r11 - mov r1, r8 - mov r0, r7 -4: cmp r12, #3 - subgt r6, r12, #4 - movgt r9, r12 - lsrgt r5, r6, #2 - addgt r3, r5, #1 - movgt r12, #0 - lslgt r4, r3, #2 - ble 7f -8: ldr r3, [r0, r12] - ldr r2, [r1, r12] - uqadd8 r3, r3, r2 - str r3, [r1, r12] - add r12, r12, #4 - cmp r12, r4 - bne 8b - sub r3, r9, #4 - bic r3, r3, #3 - add r3, r3, #4 - subs r12, r6, r5, lsl #2 - add r1, r1, r3 - add r0, r0, r3 - beq 1b -7: mov r4, #0 -9: ldrb r3, [r1, r4] - ldrb r2, [r0, r4] - uqadd8 r3, r2, r3 - strb r3, [r1, r4] - add r4, r4, #1 - cmp r4, r12 - bne 9b - ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 - b 10b -.endfunc +.macro blit_init + line_saved_regs STRIDE_D, STRIDE_S +.endm + +.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M +110: pixld , 16, 0, SRC, unaligned_src + pixld , 16, 4, SRC, unaligned_src + pld [SRC, SCRATCH] + pixst , 16, 0, DST + pixst , 16, 4, DST + subs X, X, #32*8/src_bpp + bhs 110b + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 4, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 4, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +generate_composite_function \ + pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +/******************************************************************************/ + +.macro 
src_n_8888_init + ldr SRC, [sp, #ARGS_STACK_OFFSET] + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm -pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #20 - cmp r1, #0 - mov r12, r2 - str r1, [sp, #12] - str r0, [sp, #16] - ldr r2, [sp, #52] - beq 0f - lsl r3, r3, #2 - str r3, [sp] - ldr r3, [sp, #56] - mov r10, #0 - lsl r3, r3, #2 - str r3, [sp, #8] - mov r11, r3 - b 1f -6: ldr r11, [sp, #8] -1: ldr r9, [sp] - mov r0, r12 - add r12, r12, r9 - mov r1, r2 - str r12, [sp, #4] - add r2, r2, r11 - ldr r12, [sp, #16] - ldr r3, =0x00800080 - ldr r9, =0xff00ff00 - mov r11, #255 - cmp r12, #0 - beq 4f -5: ldr r5, [r1], #4 - ldr r4, [r0] - sub r8, r11, r5, lsr #24 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mla r6, r6, r8, r3 - mla r7, r7, r8, r3 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - and r7, r7, r9 - uxtab16 r6, r7, r6, ror #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 5b -4: ldr r3, [sp, #12] - add r10, r10, #1 - cmp r10, r3 - ldr r12, [sp, #4] - bne 6b -0: add sp, sp, #20 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc +.macro src_n_0565_init + ldrh SRC, [sp, #ARGS_STACK_OFFSET] + orr SRC, SRC, lsl #16 + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm + +.macro src_n_8_init + ldrb SRC, [sp, #ARGS_STACK_OFFSET] + orr SRC, SRC, lsl #8 + orr SRC, SRC, lsl #16 + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm + +.macro fill_process_tail cond, numbytes, firstreg + WK4 .req SRC + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M + pixst cond, numbytes, 4, DST + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_8888_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +generate_composite_function \ + pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_0565_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +generate_composite_function \ + pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_8_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +/******************************************************************************/ + +.macro src_x888_8888_pixel, cond, reg + orr&cond WK®, WK®, #0xFF000000 +.endm + +.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes, firstreg, SRC, unaligned_src +.endm -pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - str r1, [sp, #12] - ldrb r1, [sp, #71] - mov r12, r2 - str r0, [sp, #16] - ldr r2, [sp, #60] - str r1, [sp, #24] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #20] - ldr r3, [sp, #64] - mov r10, #0 - lsl r3, r3, #2 - 
str r3, [sp, #8] - mov r11, r3 - b 1f -5: ldr r11, [sp, #8] -1: ldr r4, [sp, #20] - mov r0, r12 - mov r1, r2 - add r12, r12, r4 - add r2, r2, r11 - str r12, [sp] - str r2, [sp, #4] - ldr r12, [sp, #16] - ldr r2, =0x00800080 - ldr r3, [sp, #24] - mov r11, #255 - cmp r12, #0 - beq 3f -4: ldr r5, [r1], #4 - ldr r4, [r0] - uxtb16 r6, r5 - uxtb16 r7, r5, ror #8 - mla r6, r6, r3, r2 - mla r7, r7, r3, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - sub r8, r11, r5, lsr #24 - mla r6, r6, r8, r2 - mla r7, r7, r8, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 4b -3: ldr r1, [sp, #12] - add r10, r10, #1 - cmp r10, r1 - ldr r12, [sp] - ldr r2, [sp, #4] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc +.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg + src_x888_8888_pixel cond, %(firstreg+0) + .if numbytes >= 8 + src_x888_8888_pixel cond, %(firstreg+1) + .if numbytes == 16 + src_x888_8888_pixel cond, %(firstreg+2) + src_x888_8888_pixel cond, %(firstreg+3) + .endif + .endif +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ + nop_macro, /* init */ \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + pixman_composite_src_x888_8888_process_head, \ + pixman_composite_src_x888_8888_process_tail + +/******************************************************************************/ + +.macro src_0565_8888_init + /* Hold loop invariants in MASK and STRIDE_M */ + ldr MASK, =0x07E007E0 + mov STRIDE_M, #0xFF000000 + /* Set GE[3:0] to 1010 so SEL instructions do what we want */ + ldr SCRATCH, =0x80008000 + uadd8 SCRATCH, SCRATCH, SCRATCH +.endm + +.macro src_0565_8888_2pixels, reg1, reg2 + and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG + bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000 + orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 + orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 + pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- + sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg + pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- + sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- + orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb + orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +.endm -pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - ldr r9, [sp, #60] - str r1, [sp, #12] - bic r1, r9, #-16777216 - str r1, [sp, #20] - mov r12, r2 - lsr r1, r9, #8 - ldr r2, [sp, #20] - bic r1, r1, #-16777216 - bic r2, r2, #65280 - bic r1, r1, #65280 - str r2, [sp, #20] - str r0, [sp, #16] - str r1, [sp, #4] - ldr r2, [sp, #68] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #24] - 
mov r0, #0 - b 1f -5: ldr r3, [sp, #24] -1: ldr r4, [sp, #72] - mov r10, r12 - mov r1, r2 - add r12, r12, r3 - add r2, r2, r4 - str r12, [sp, #8] - str r2, [sp] - ldr r12, [sp, #16] - ldr r11, =0x00800080 - ldr r2, [sp, #4] - ldr r3, [sp, #20] - cmp r12, #0 - beq 3f -4: ldrb r5, [r1], #1 - ldr r4, [r10] - mla r6, r3, r5, r11 - mla r7, r2, r5, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mvn r8, r5 - lsr r8, r8, #24 - mla r6, r6, r8, r11 - mla r7, r7, r8, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r10], #4 - subs r12, r12, #1 - bne 4b -3: ldr r4, [sp, #12] - add r0, r0, #1 - cmp r0, r4 - ldr r12, [sp, #8] - ldr r2, [sp] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc +/* This version doesn't need STRIDE_M, but is one instruction longer. + It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? + and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 + bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb + mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 + mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB + orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB + pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB + sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb + orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +*/ + +.macro src_0565_8888_1pixel, reg + bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb + and WK®, WK®, MASK @ 000000000000000000000gggggg00000 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb + orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +.endm + +.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 + pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src + .elseif numbytes == 8 + pixld , 4, firstreg, SRC, unaligned_src + .elseif numbytes == 4 + pixld , 2, firstreg, SRC, unaligned_src + .endif +.endm -/* - * Note: This code is only using armv5te instructions (not even armv6), - * but is scheduled for ARM Cortex-A8 pipeline. So it might need to - * be split into a few variants, tuned for each microarchitecture. 
- * - * TODO: In order to get good performance on ARM9/ARM11 cores (which don't - * have efficient write combining), it needs to be changed to use 16-byte - * aligned writes using STM instruction. - * - * Nearest scanline scaler macro template uses the following arguments: - * fname - name of the function to generate - * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes - * t - type suffix for LDR/STR instructions - * prefetch_distance - prefetch in the source image by that many - * pixels ahead - * prefetch_braking_distance - stop prefetching when that many pixels are - * remaining before the end of scanline - */ - -.macro generate_nearest_scanline_func fname, bpp_shift, t, \ - prefetch_distance, \ - prefetch_braking_distance +.macro src_0565_8888_process_tail cond, numbytes, firstreg + .if numbytes == 16 + src_0565_8888_2pixels firstreg, %(firstreg+1) + src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + src_0565_8888_2pixels firstreg, %(firstreg+1) + .else + src_0565_8888_1pixel firstreg + .endif +.endm -pixman_asm_function fname - W .req r0 - DST .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - TMP1 .req r4 - TMP2 .req r5 - VXMASK .req r6 - PF_OFFS .req r7 - SRC_WIDTH_FIXED .req r8 +generate_composite_function \ + pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ + 3, /* prefetch distance */ \ + src_0565_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_0565_8888_process_head, \ + src_0565_8888_process_tail - ldr UNIT_X, [sp] - push {r4, r5, r6, r7, r8, r10} - mvn VXMASK, #((1 << bpp_shift) - 1) - ldr SRC_WIDTH_FIXED, [sp, #28] - - /* define helper macro */ - .macro scale_2_pixels - ldr&t TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X - str&t TMP1, [DST], #(1 << bpp_shift) -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b +/******************************************************************************/ - ldr&t TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X - str&t TMP2, [DST], #(1 << bpp_shift) -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - .endm +.macro add_8_8_8pixels cond, dst1, dst2 + uqadd8&cond WK&dst1, WK&dst1, MASK + uqadd8&cond WK&dst2, WK&dst2, STRIDE_M +.endm + +.macro add_8_8_4pixels cond, dst + uqadd8&cond WK&dst, WK&dst, MASK +.endm - /* now do the scaling */ - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - subs W, W, #(8 + prefetch_braking_distance) - blt 2f - /* calculate prefetch offset */ - mov PF_OFFS, #prefetch_distance - mla PF_OFFS, UNIT_X, PF_OFFS, VX -1: /* main loop, process 8 pixels per iteration with prefetch */ - pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] - add PF_OFFS, UNIT_X, lsl #3 - scale_2_pixels - scale_2_pixels - scale_2_pixels - scale_2_pixels - subs W, W, #8 - bge 1b -2: - subs W, W, #(4 - 8 - prefetch_braking_distance) - blt 2f -1: /* process the remaining pixels */ - scale_2_pixels - scale_2_pixels - subs W, W, #4 - bge 1b -2: - tst W, #2 - beq 2f - scale_2_pixels -2: - tst W, #1 - ldrne&t TMP1, [SRC, TMP1] - strne&t TMP1, [DST] - /* cleanup helper macro */ - .purgem scale_2_pixels - .unreq DST - .unreq SRC - .unreq W - .unreq VX - .unreq UNIT_X - .unreq TMP1 - .unreq TMP2 - .unreq VXMASK - .unreq PF_OFFS - .unreq SRC_WIDTH_FIXED - /* return */ - pop {r4, r5, r6, r7, r8, r10} - bx lr -.endfunc +.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req MASK + 
WK5 .req STRIDE_M + .if numbytes == 16 + pixld cond, 8, 4, SRC, unaligned_src + pixld cond, 16, firstreg, DST, 0 + add_8_8_8pixels cond, firstreg, %(firstreg+1) + pixld cond, 8, 4, SRC, unaligned_src + .else + pixld cond, numbytes, 4, SRC, unaligned_src + pixld cond, numbytes, firstreg, DST, 0 + .endif + .unreq WK4 + .unreq WK5 .endm -generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 +.macro add_8_8_process_tail cond, numbytes, firstreg + .if numbytes == 16 + add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + add_8_8_8pixels cond, firstreg, %(firstreg+1) + .else + add_8_8_4pixels cond, firstreg + .endif +.endm + +generate_composite_function \ + pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 2, /* prefetch distance */ \ + nop_macro, /* init */ \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + add_8_8_process_head, \ + add_8_8_process_tail + +/******************************************************************************/ + +.macro over_8888_8888_init + /* Hold loop invariant in MASK */ + ldr MASK, =0x00800080 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, MASK, MASK + line_saved_regs STRIDE_D, STRIDE_S, ORIG_W +.endm + +.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W + pixld , numbytes, %(4+firstreg), SRC, unaligned_src + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 + /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */ + teq WK®0, #0 + .if numbytes > 4 + teqeq WK®1, #0 + .if numbytes > 8 + teqeq WK®2, #0 + teqeq WK®3, #0 + .endif + .endif +.endm + +.macro over_8888_8888_prepare next + mov WK&next, WK&next, lsr #24 +.endm + +.macro over_8888_8888_1pixel src, dst, offset, next + /* src = destination component multiplier */ + rsb WK&src, WK&src, #255 + /* Split even/odd bytes of dst into SCRATCH/dst */ + uxtb16 SCRATCH, WK&dst + uxtb16 WK&dst, WK&dst, ror #8 + /* Multiply through, adding 0.5 to the upper byte of result for rounding */ + mla SCRATCH, SCRATCH, WK&src, MASK + mla WK&dst, WK&dst, WK&src, MASK + /* Where we would have had a stall between the result of the first MLA and the shifter input, + * reload the complete source pixel */ + ldr WK&src, [SRC, #offset] + /* Multiply by 257/256 to approximate 256/255 */ + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + /* In this stall, start processing the next pixel */ + .if offset < -4 + mov WK&next, WK&next, lsr #24 + .endif + uxtab16 WK&dst, WK&dst, WK&dst, ror #8 + /* Recombine even/odd bytes of multiplied destination */ + mov SCRATCH, SCRATCH, ror #8 + sel WK&dst, SCRATCH, WK&dst + /* Saturated add of source to multiplied destination */ + uqadd8 WK&dst, WK&dst, WK&src +.endm + +.macro over_8888_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W + over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) + beq 10f + over_8888_8888_prepare %(4+firstreg) + .set PROCESS_REG, firstreg + .set PROCESS_OFF, -numbytes + .rept numbytes / 4 + over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), 
PROCESS_OFF, %(5+PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .set PROCESS_OFF, PROCESS_OFF+4 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ + over_8888_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + over_8888_8888_process_head, \ + over_8888_8888_process_tail + +/******************************************************************************/ + +/* Multiply each byte of a word by a byte. + * Useful when there aren't any obvious ways to fill the stalls with other instructions. + * word Register containing 4 bytes + * byte Register containing byte multiplier (bits 8-31 must be 0) + * tmp Scratch register + * half Register containing the constant 0x00800080 + * GE[3:0] bits must contain 0101 + */ +.macro mul_8888_8 word, byte, tmp, half + /* Split even/odd bytes of word apart */ + uxtb16 tmp, word + uxtb16 word, word, ror #8 + /* Multiply bytes together with rounding, then by 257/256 */ + mla tmp, tmp, byte, half + mla word, word, byte, half /* 1 stall follows */ + uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ + uxtab16 word, word, word, ror #8 + /* Recombine bytes */ + mov tmp, tmp, ror #8 + sel word, tmp, word +.endm -generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 +/******************************************************************************/ + +.macro over_8888_n_8888_init + /* Mask is constant */ + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + /* Hold loop invariant in STRIDE_M */ + ldr STRIDE_M, =0x00800080 + /* We only want the alpha bits of the constant mask */ + mov MASK, MASK, lsr #24 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, STRIDE_M, STRIDE_M + line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W +.endm + +.macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W + pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +.macro over_8888_n_8888_1pixel src, dst + mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M + sub WK7, WK6, WK&src, lsr #24 + mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M + uqadd8 WK&dst, WK&dst, WK&src +.endm + +.macro over_8888_n_8888_process_tail cond, numbytes, firstreg + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W + over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) + beq 10f + mov WK6, #255 + .set PROCESS_REG, firstreg + .rept numbytes / 4 + .if numbytes == 16 && PROCESS_REG == 2 + /* We're using WK6 and WK7 as temporaries, so half way through + * 4 pixels, reload the second two source pixels but this time + * into WK4 and WK5 */ + ldmdb SRC, {WK4, WK5} + .endif + over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ 
+ 2, /* prefetch distance */ \ + over_8888_n_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + over_8888_n_8888_process_head, \ + over_8888_n_8888_process_tail + +/******************************************************************************/ + +.macro over_n_8_8888_init + /* Source is constant, but splitting it into even/odd bytes is a loop invariant */ + ldr SRC, [sp, #ARGS_STACK_OFFSET] + /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */ + ldr SCRATCH, =0x00800080 + uxtb16 STRIDE_S, SRC + uxtb16 SRC, SRC, ror #8 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, SCRATCH, SCRATCH + line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W +.endm + +.macro over_n_8_8888_newline + ldr STRIDE_D, =0x00800080 + b 1f + .ltorg +1: +.endm + +.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_M + pixld , numbytes/4, 4, MASK, unaligned_mask + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 +.endm + +.macro over_n_8_8888_1pixel src, dst + uxtb Y, WK4, ror #src*8 + /* Trailing part of multiplication of source */ + mla SCRATCH, STRIDE_S, Y, STRIDE_D + mla Y, SRC, Y, STRIDE_D + mov ORIG_W, #255 + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + uxtab16 Y, Y, Y, ror #8 + mov SCRATCH, SCRATCH, ror #8 + sub ORIG_W, ORIG_W, Y, lsr #24 + sel Y, SCRATCH, Y + /* Then multiply the destination */ + mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D + uqadd8 WK&dst, WK&dst, Y +.endm + +.macro over_n_8_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_M + teq WK4, #0 + beq 10f + .set PROCESS_REG, firstreg + .rept numbytes / 4 + over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ + over_n_8_8888_init, \ + over_n_8_8888_newline, \ + nop_macro, /* cleanup */ \ + over_n_8_8888_process_head, \ + over_n_8_8888_process_tail + +/******************************************************************************/ +
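Most of the work in the over_* tails above is the premultiplied OVER operator, dst = src + dst * (255 - alpha(src)) / 255, evaluated two channels at a time in the even/odd byte lanes split out by UXTB16. The division by 255 is the MLA-with-0x00800080 plus 'UXTAB16 x, x, x, ror #8' sequence, i.e. the usual (t + 0x80 + ((t + 0x80) >> 8)) >> 8 rounding trick (the comments describe it as multiplying by 257/256 to approximate 256/255), and UQADD8 provides the final saturating add. A scalar C sketch of the same arithmetic, one channel at a time rather than two per register:

    #include <stdint.h>

    /* Rounded t/255 for t in [0, 255*255], matching the MLA/UXTAB16 trick. */
    static inline uint32_t
    div255 (uint32_t t)
    {
        t += 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* Premultiplied OVER for one ARGB32 pixel (illustrative only; the
     * assembly handles two channels per register and skips fully
     * transparent source pixels up front). */
    static uint32_t
    over_pixel (uint32_t src, uint32_t dst)
    {
        uint32_t ia = 255 - (src >> 24);   /* rsb WK&src, WK&src, #255 */
        uint32_t result = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint32_t s = (src >> shift) & 0xff;
            uint32_t d = div255 (((dst >> shift) & 0xff) * ia);
            uint32_t c = s + d;
            if (c > 255)                   /* UQADD8 saturates per byte */
                c = 255;
            result |= c << shift;
        }
        return result;
    }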
new file mode 100644 --- /dev/null +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h @@ -0,0 +1,908 @@ +/* + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ben Avison (bavison@riscosopen.org) + * + */ + +/* + * Because the alignment of pixel data to cachelines, and even the number of + * cachelines per row can vary from row to row, and because of the need to + * preload each scanline once and only once, this prefetch strategy treats + * each row of pixels independently. When a pixel row is long enough, there + * are three distinct phases of prefetch: + * * an inner loop section, where each time a cacheline of data is + * processed, another cacheline is preloaded (the exact distance ahead is + * determined empirically using profiling results from lowlevel-blt-bench) + * * a leading section, where enough cachelines are preloaded to ensure no + * cachelines escape being preloaded when the inner loop starts + * * a trailing section, where a limited number (0 or more) of cachelines + * are preloaded to deal with data (if any) that hangs off the end of the + * last iteration of the inner loop, plus any trailing bytes that were not + * enough to make up one whole iteration of the inner loop + * + * There are (in general) three distinct code paths, selected between + * depending upon how long the pixel row is. If it is long enough that there + * is at least one iteration of the inner loop (as described above) then + * this is described as the "wide" case. If it is shorter than that, but + * there are still enough bytes output that there is at least one 16-byte- + * long, 16-byte-aligned write to the destination (the optimum type of + * write), then this is the "medium" case. If it is not even this long, then + * this is the "narrow" case, and there is no attempt to align writes to + * 16-byte boundaries. In the "medium" and "narrow" cases, all the + * cachelines containing data from the pixel row are prefetched up-front. + */ + +/* + * Determine whether we put the arguments on the stack for debugging. + */ +#undef DEBUG_PARAMS + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. 
+ */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_COND_EXEC, 0 +.set FLAG_BRANCH_OVER, 2 +.set FLAG_PROCESS_PRESERVES_PSR, 0 +.set FLAG_PROCESS_CORRUPTS_PSR, 4 +.set FLAG_PROCESS_DOESNT_STORE, 0 +.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ +.set FLAG_NO_SPILL_LINE_VARS, 0 +.set FLAG_SPILL_LINE_VARS_WIDE, 16 +.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 +.set FLAG_SPILL_LINE_VARS, 48 +.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 +.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 + +/* + * Offset into stack where mask and source pointer/stride can be accessed. + */ +#ifdef DEBUG_PARAMS +.set ARGS_STACK_OFFSET, (9*4+9*4) +#else +.set ARGS_STACK_OFFSET, (9*4) +#endif + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 +.set PREFETCH_TYPE_STANDARD, 1 + +/* + * Definitions of macros for load/store of pixel data. + */ + +.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 + .if numbytes == 16 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + op&r&cond WK®2, [base], #4 + op&r&cond WK®3, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} + .endif + .elseif numbytes == 8 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1} + .endif + .elseif numbytes == 4 + op&r&cond WK®0, [base], #4 + .elseif numbytes == 2 + op&r&cond&h WK®0, [base], #2 + .elseif numbytes == 1 + op&r&cond&b WK®0, [base], #1 + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base + .if numbytes == 16 + stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} + .elseif numbytes == 8 + stm&cond&db base, {WK®0,WK®1} + .elseif numbytes == 4 + str&cond WK®0, [base, #-4] + .elseif numbytes == 2 + str&cond&h WK®0, [base, #-2] + .elseif numbytes == 1 + str&cond&b WK®0, [base, #-1] + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixld cond, numbytes, firstreg, base, unaligned + pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned +.endm + +.macro pixst cond, numbytes, firstreg, base + .if (flags) & FLAG_DST_READWRITE + pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .else + pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .endif +.endm + +.macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) + a x + .endif +.endm + + +.macro preload_leading_step1 bpp, ptr, base +/* If the destination is already 16-byte aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ + .if bpp > 0 + PF bic, ptr, base, #31 + .set OFFSET, 0 + .rept prefetch_distance+1 + PF pld, [ptr, #OFFSET] + .set OFFSET, OFFSET+32 + .endr + .endif +.endm + +.macro preload_leading_step2 bpp, bpp_shift, ptr, base +/* However, if the destination is not 16-byte aligned, we may need to + * preload more cache lines than that. The question we need to ask is: + * are the bytes corresponding to the leading pixels more than the amount + * by which the source pointer will be rounded down for preloading, and if + * so, by how many cache lines? 
Effectively, we want to calculate + * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp + * inner_loop_offset = (src+leading_bytes)&31 + * extra_needed = leading_bytes - inner_loop_offset + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only + * possible when there are 4 src bytes for every 1 dst byte). + */ + .if bpp > 0 + .ifc base,DST + /* The test can be simplified further when preloading the destination */ + PF tst, base, #16 + PF beq, 61f + .else + .if bpp/dst_w_bpp == 4 + PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift + PF and, SCRATCH, SCRATCH, #31 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift + PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ + PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */ + PF bcs, 61f + PF bpl, 60f + PF pld, [ptr, #32*(prefetch_distance+2)] + .else + PF mov, SCRATCH, base, lsl #32-5 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF bls, 61f + .endif + .endif +60: PF pld, [ptr, #32*(prefetch_distance+1)] +61: + .endif +.endm + +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) +.macro preload_middle bpp, base, scratch_holds_offset + .if bpp > 0 + /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) + .if scratch_holds_offset + PF pld, [base, SCRATCH] + .else + PF bic, SCRATCH, base, #31 + PF pld, [SCRATCH, #32*prefetch_distance] + .endif + .endif + .endif +.endm + +.macro preload_trailing bpp, bpp_shift, base + .if bpp > 0 + .if bpp*pix_per_block > 256 + /* Calculations are more complex if more than one fetch per block */ + PF and, WK1, base, #31 + PF add, WK1, WK1, WK0, lsl #bpp_shift + PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) + PF bic, SCRATCH, base, #31 +80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] + PF add, SCRATCH, SCRATCH, #32 + PF subs, WK1, WK1, #32 + PF bhi, 80b + .else + /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ + PF mov, SCRATCH, base, lsl #32-5 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift + PF adceqs, SCRATCH, SCRATCH, #0 + /* The instruction above has two effects: ensures Z is only + * set if C was clear (so Z indicates that both shifted quantities + * were 0), and clears C if Z was set (so C indicates that the sum + * of the shifted quantities was greater and not equal to 32) */ + PF beq, 82f + PF bic, SCRATCH, base, #31 + PF bcc, 81f + PF pld, [SCRATCH, #32*(prefetch_distance+2)] +81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] +82: + .endif + .endif +.endm + + +.macro preload_line narrow_case, bpp, bpp_shift, base +/* "narrow_case" - just means that the macro was invoked from the "narrow" + * code path rather than the "medium" one - because in the narrow case, + * the row of pixels is known to output no more than 30 bytes, then + * (assuming the source pixels are no wider than the the destination + * pixels) they cannot possibly straddle more than 2 32-byte cachelines, + * meaning there's no need for a loop. 
+ * "bpp" - number of bits per pixel in the channel (source, mask or + * destination) that's being preloaded, or 0 if this channel is not used + * for reading + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) + * "base" - base address register of channel to preload (SRC, MASK or DST) + */ + .if bpp > 0 + .if narrow_case && (bpp <= dst_w_bpp) + /* In these cases, each line for each channel is in either 1 or 2 cache lines */ + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, LSL #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 90f + PF pld, [WK1] +90: + .else + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, lsl #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 92f +91: PF add, WK0, WK0, #32 + PF cmp, WK0, WK1 + PF pld, [WK0] + PF bne, 91b +92: + .endif + .endif +.endm + + +.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond X, X, #8*numbytes/dst_w_bpp + .endif + process_tail cond, numbytes, firstreg + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst cond, numbytes, firstreg, DST + .endif +.endm + +.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_BRANCH_OVER + .ifc cond,mi + bpl 100f + .endif + .ifc cond,cs + bcc 100f + .endif + .ifc cond,ne + beq 100f + .endif + conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +100: + .else + conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .endif +.endm + +.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) + /* Can't interleave reads and writes */ + test + conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_PROCESS_CORRUPTS_PSR + test + .endif + conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx + .else + /* Can interleave reads and writes for better scheduling */ + test + process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 + process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond1 X, X, #8*numbytes1/dst_w_bpp + sub&cond2 X, X, #8*numbytes2/dst_w_bpp + .endif + process_tail cond1, numbytes1, firstreg1 + process_tail cond2, numbytes2, firstreg2 + pixst cond1, numbytes1, firstreg1, DST + pixst cond2, numbytes2, firstreg2, DST + .endif +.endm + + +.macro test_bits_1_0_ptr + movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ +.endm + +.macro test_bits_3_2_ptr + movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ +.endm + +.macro leading_15bytes process_head, process_tail + /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ + /* Use unaligned loads in all cases for simplicity */ + .if dst_w_bpp == 8 + conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1 + .elseif dst_w_bpp == 16 + test_bits_1_0_ptr + 
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1 + .endif + conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1 +.endm + +.macro test_bits_3_2_pix + movs SCRATCH, X, lsl #dst_bpp_shift+32-3 +.endm + +.macro test_bits_1_0_pix + .if dst_w_bpp == 8 + movs SCRATCH, X, lsl #dst_bpp_shift+32-1 + .else + movs SCRATCH, X, lsr #1 + .endif +.endm + +.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 + .if dst_w_bpp == 16 + test_bits_1_0_pix + conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 + .elseif dst_w_bpp == 8 + conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 + .endif +.endm + + +.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +110: + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ + .rept pix_per_block*dst_w_bpp/128 + process_head , 16, 0, unaligned_src, unaligned_mask, 1 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle src_bpp, SRC, 1 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle mask_bpp, MASK, 1 + .else + preload_middle src_bpp, SRC, 0 + preload_middle mask_bpp, MASK, 0 + .endif + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) + /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that + * destination prefetches are 32-byte aligned. It's also the easiest channel to offset + * preloads for, to achieve staggered prefetches for multiple channels, because there are + * always two STMs per prefetch, so there is always an opposite STM on which to put the + * preload. 
Note, no need to BIC the base register here */ + PF pld, [DST, #32*prefetch_distance - dst_alignment] + .endif + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + .set SUBBLOCK, SUBBLOCK+1 + .endr + subs X, X, #pix_per_block + bhs 110b +.endm + +.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask + /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ + .if dst_r_bpp > 0 + tst DST, #16 + bne 111f + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + b 112f +111: + .endif + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 +112: + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) + PF and, WK0, X, #pix_per_block-1 + .endif + preload_trailing src_bpp, src_bpp_shift, SRC + preload_trailing mask_bpp, mask_bpp_shift, MASK + preload_trailing dst_r_bpp, dst_bpp_shift, DST + add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp + /* The remainder of the line is handled identically to the medium case */ + medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask +.endm + +.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask +120: + process_head , 16, 0, unaligned_src, unaligned_mask, 0 + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + subs X, X, #128/dst_w_bpp + bhs 120b + /* Trailing pixels */ + tst X, #128/dst_w_bpp - 1 + beq exit_label + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask + tst X, #16*8/dst_w_bpp + conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 + /* Trailing pixels */ + /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ + .if mask_bpp == 8 || mask_bpp == 16 + tst MASK, #3 + bne 141f + .endif + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 140f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 0 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +140: + action process_head, process_tail, process_inner_loop, exit_label, 1, 0 + .endif + .if mask_bpp == 8 || mask_bpp == 16 + b exit_label +141: + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 142f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 1 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +142: + action process_head, process_tail, process_inner_loop, exit_label, 1, 1 + .endif + .endif +.endm + + +.macro end_of_line restore_x, vars_spilled, loop_label, last_one + .if vars_spilled + /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? 
*/ + /* This is ldmia sp,{} */ + .word 0xE89D0000 | LINE_SAVED_REGS + .endif + subs Y, Y, #1 + .if vars_spilled + .if (LINE_SAVED_REGS) & (1<<1) + str Y, [sp] + .endif + .endif + add DST, DST, STRIDE_D + .if src_bpp > 0 + add SRC, SRC, STRIDE_S + .endif + .if mask_bpp > 0 + add MASK, MASK, STRIDE_M + .endif + .if restore_x + mov X, ORIG_W + .endif + bhs loop_label + .ifc "last_one","" + .if vars_spilled + b 197f + .else + b 198f + .endif + .else + .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) + b 198f + .endif + .endif +.endm + + +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags_, \ + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + + .func fname + .global fname + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set flags, flags_ + .set prefetch_distance, prefetch_distance_ + +/* + * Select prefetch type for this function. + */ + .if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .else + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD + .endif + + .if src_bpp == 32 + .set src_bpp_shift, 2 + .elseif src_bpp == 24 + .set src_bpp_shift, 0 + .elseif src_bpp == 16 + .set src_bpp_shift, 1 + .elseif src_bpp == 8 + .set src_bpp_shift, 0 + .elseif src_bpp == 0 + .set src_bpp_shift, -1 + .else + .error "requested src bpp (src_bpp) is not supported" + .endif + + .if mask_bpp == 32 + .set mask_bpp_shift, 2 + .elseif mask_bpp == 24 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 8 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 0 + .set mask_bpp_shift, -1 + .else + .error "requested mask bpp (mask_bpp) is not supported" + .endif + + .if dst_w_bpp == 32 + .set dst_bpp_shift, 2 + .elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + + .if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif + + .set pix_per_block, 16*8/dst_w_bpp + .if src_bpp != 0 + .if 32*8/src_bpp > pix_per_block + .set pix_per_block, 32*8/src_bpp + .endif + .endif + .if mask_bpp != 0 + .if 32*8/mask_bpp > pix_per_block + .set pix_per_block, 32*8/mask_bpp + .endif + .endif + .if dst_r_bpp != 0 + .if 32*8/dst_r_bpp > pix_per_block + .set pix_per_block, 32*8/dst_r_bpp + .endif + .endif + +/* The standard entry conditions set up by pixman-arm-common.h are: + * r0 = width (pixels) + * r1 = height (rows) + * r2 = pointer to top-left pixel of destination + * r3 = destination stride (pixels) + * [sp] = source pixel value, or pointer to top-left pixel of source + * [sp,#4] = 0 or source stride (pixels) + * The following arguments are unused for non-mask operations + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask + * [sp,#12] = 0 or mask stride (pixels) + */ + +/* + * Assign symbolic names to registers + */ + X .req r0 /* pixels to go on this line */ + Y .req r1 /* lines to go */ + DST .req r2 /* destination pixel pointer */ + STRIDE_D .req r3 /* destination stride (bytes, minus width) */ + SRC .req r4 /* source pixel pointer */ + STRIDE_S .req r5 /* source stride (bytes, minus width) */ + MASK .req r6 /* 
mask pixel pointer (if applicable) */ + STRIDE_M .req r7 /* mask stride (bytes, minus width) */ + WK0 .req r8 /* pixel data registers */ + WK1 .req r9 + WK2 .req r10 + WK3 .req r11 + SCRATCH .req r12 + ORIG_W .req r14 /* width (pixels) */ + +fname: + push {r4-r11, lr} /* save all registers */ + + subs Y, Y, #1 + blo 199f + +#ifdef DEBUG_PARAMS + sub sp, sp, #9*4 +#endif + + .if src_bpp > 0 + ldr SRC, [sp, #ARGS_STACK_OFFSET] + ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] + .endif + .if mask_bpp > 0 + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] + .endif + +#ifdef DEBUG_PARAMS + add Y, Y, #1 + stmia sp, {r0-r7,pc} + sub Y, Y, #1 +#endif + + init + + lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ + sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift + .if src_bpp > 0 + lsl STRIDE_S, #src_bpp_shift + sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift + .endif + .if mask_bpp > 0 + lsl STRIDE_M, #mask_bpp_shift + sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift + .endif + + /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ + cmp X, #2*16*8/dst_w_bpp - 1 + blo 170f + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ + /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ + cmp X, #(prefetch_distance+3)*pix_per_block - 1 + blo 160f + + /* Wide case */ + /* Adjust X so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub X, X, #(prefetch_distance+2)*pix_per_block + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +151: /* New line */ + newline + preload_leading_step1 src_bpp, WK1, SRC + preload_leading_step1 mask_bpp, WK2, MASK + preload_leading_step1 dst_r_bpp, WK3, DST + + tst DST, #15 + beq 154f + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ + .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp) + PF and, WK0, WK0, #15 + .endif + + preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC + preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + + leading_15bytes process_head, process_tail + +154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, SRC, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, MASK, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .endif + .ifc "process_inner_loop","" + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f + .else + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f + .endif + +157: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b + .endif + + .ltorg + +160: /* Medium case */ + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +161: /* New line */ + newline + preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + 
preload_line 0, mask_bpp, mask_bpp_shift, MASK + preload_line 0, dst_r_bpp, dst_bpp_shift, DST + + sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ + tst DST, #15 + beq 164f + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ + + leading_15bytes process_head, process_tail + +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ + switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f + +167: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b + + .ltorg + +170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + .if dst_w_bpp < 32 + mov ORIG_W, X + .endif + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +171: /* New line */ + newline + preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 1, mask_bpp, mask_bpp_shift, MASK + preload_line 1, dst_r_bpp, dst_bpp_shift, DST + + .if dst_w_bpp == 8 + tst DST, #3 + beq 174f +172: subs X, X, #1 + blo 177f + process_head , 1, 0, 1, 1, 0 + process_tail , 1, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 1, 0, DST + .endif + tst DST, #3 + bne 172b + .elseif dst_w_bpp == 16 + tst DST, #2 + beq 174f + subs X, X, #1 + blo 177f + process_head , 2, 0, 1, 1, 0 + process_tail , 2, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 2, 0, DST + .endif + .endif + +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ + switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f + +177: /* Check for another line */ + end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + +197: + .if (flags) & FLAG_SPILL_LINE_VARS + add sp, sp, #LINE_SAVED_REG_COUNT*4 + .endif +198: + cleanup + +#ifdef DEBUG_PARAMS + add sp, sp, #9*4 /* junk the debug copy of arguments */ +#endif +199: + pop {r4-r11, pc} /* exit */ + + .ltorg + + .unreq X + .unreq Y + .unreq DST + .unreq STRIDE_D + .unreq SRC + .unreq STRIDE_S + .unreq MASK + .unreq STRIDE_M + .unreq WK0 + .unreq WK1 + .unreq WK2 + .unreq WK3 + .unreq SCRATCH + .unreq ORIG_W + .endfunc +.endm + +.macro line_saved_regs x:vararg + .set LINE_SAVED_REGS, 0 + .set LINE_SAVED_REG_COUNT, 0 + .irp SAVED_REG,x + .ifc "SAVED_REG","Y" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_D" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_S" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_M" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","ORIG_W" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .endr +.endm + +.macro nop_macro x:vararg +.endm
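The "(bytes, minus width)" convention noted in the register comments above is what lets end_of_line advance every pointer to the next row with a single add per channel: the entry code shifts each pixel stride up to bytes and then subtracts the row's width in bytes. A minimal C sketch of the same bookkeeping, under the assumption that bpp_shift is log2 of the bytes per pixel as in the .set *_bpp_shift tables (the helper name here is illustrative, not pixman's):

    #include <stdint.h>

    /* After a row has been processed, the pointer already sits one row's
     * worth of pixels past the row start, so adding "stride minus width"
     * (both in bytes) lands exactly on the start of the next row. */
    static uint8_t *
    next_row (uint8_t *ptr, int32_t stride_pixels, int32_t width_pixels,
              int bpp_shift)
    {
        int32_t step = (stride_pixels - width_pixels) << bpp_shift;
        return ptr + step;
    }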
--- a/gfx/cairo/libpixman/src/pixman-arm-simd.c +++ b/gfx/cairo/libpixman/src/pixman-arm-simd.c @@ -26,360 +26,26 @@ #ifdef HAVE_CONFIG_H #include <config.h> #endif #include "pixman-private.h" #include "pixman-arm-common.h" #include "pixman-inlines.h" -#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ - -void -pixman_composite_add_8_8_asm_armv6 (int32_t width, - int32_t height, - uint8_t *dst_line, - int32_t dst_stride, - uint8_t *src_line, - int32_t src_stride) -{ - uint8_t *dst, *src; - int32_t w; - uint8_t s, d; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments - */ - while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - - while (w >= 4) - { - asm ("uqadd8 %0, %1, %2" - : "=r" (*(uint32_t*)dst) - : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - } - -} - -void -pixman_composite_over_8888_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t upper_component_mask = 0xff00ff00; - uint32_t alpha_mask = 0xff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. 
- * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - "ldr r4, [%[dest]] \n\t" - -#else - "ldr r4, [%[dest]] \n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" -#endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); - } -} - -void -pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride, - uint32_t mask) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t alpha_mask = 0xff; - - mask = (mask) >> 24; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. 
- * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); - } -} - -void -pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t src, - int32_t unused, - uint8_t *mask_line, - int32_t mask_stride) -{ - uint32_t srca; - uint32_t *dst; - uint8_t *mask; - int32_t w; - - srca = src >> 24; - - uint32_t component_mask = 0xff00ff; - uint32_t component_half = 0x800080; - - uint32_t src_hi = (src >> 8) & component_mask; - uint32_t src_lo = src & component_mask; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. 
- * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask - */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); - } -} - -#endif +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565, + uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8, + uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888, + uint16_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, uint32_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -387,18 +53,174 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKI PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, uint16_t, uint16_t) PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, uint32_t, uint32_t) +void +pixman_composite_src_n_8888_asm_armv6 (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t src); + +void +pixman_composite_src_n_0565_asm_armv6 (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint16_t src); + +void +pixman_composite_src_n_8_asm_armv6 (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t src); + +static pixman_bool_t +arm_simd_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, /* in 32-bit words */ + int bpp, + int x, + int y, + int width, + int height, + uint32_t _xor) +{ + /* stride is always multiple of 32bit units in pixman */ + uint32_t byte_stride = stride * sizeof(uint32_t); + + switch (bpp) + { + case 8: + pixman_composite_src_n_8_asm_armv6 ( + width, + height, + (uint8_t *)(((char *) bits) + y * byte_stride + x), + byte_stride, + _xor & 0xff); + return TRUE; + case 16: + pixman_composite_src_n_0565_asm_armv6 ( + width, + height, + (uint16_t *)(((char *) bits) + y * byte_stride + x * 2), + 
byte_stride / 2, + _xor & 0xffff); + return TRUE; + case 32: + pixman_composite_src_n_8888_asm_armv6 ( + width, + height, + (uint32_t *)(((char *) bits) + y * byte_stride + x * 4), + byte_stride / 4, + _xor); + return TRUE; + default: + return FALSE; + } +} + +static pixman_bool_t +arm_simd_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, /* in 32-bit words */ + int dst_stride, /* in 32-bit words */ + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + if (src_bpp != dst_bpp) + return FALSE; + + switch (src_bpp) + { + case 8: + pixman_composite_src_8_8_asm_armv6 ( + width, height, + (uint8_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4, + (uint8_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 1), src_stride * 4); + return TRUE; + case 16: + pixman_composite_src_0565_0565_asm_armv6 ( + width, height, + (uint16_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2, + (uint16_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 2), src_stride * 2); + return TRUE; + case 32: + pixman_composite_src_8888_8888_asm_armv6 ( + width, height, + (uint32_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 4), dst_stride, + (uint32_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 4), src_stride); + return TRUE; + default: + return FALSE; + } +} + static const pixman_fast_path_t arm_simd_fast_paths[] = { + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888), + + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), + + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, r3g3b2, 
null, r3g3b2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8), + + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), @@ -423,10 +245,13 @@ static const pixman_fast_path_t arm_simd { PIXMAN_OP_NONE }, }; pixman_implementation_t * _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) { pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths); + imp->blt = arm_simd_blt; + imp->fill = arm_simd_fill; + return imp; }
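Both arm_simd_fill and arm_simd_blt receive strides in 32-bit words, per the pixman convention noted in their comments, and convert them to byte offsets before computing the address of the first pixel to touch. A self-contained sketch of the 16 bpp addressing used above (the helper name is invented for illustration):

    #include <stddef.h>
    #include <stdint.h>

    /* Address of pixel (x, y) in a 16 bpp image whose stride is given in
     * 32-bit words, mirroring the bpp == 16 case of arm_simd_fill. */
    static uint16_t *
    pixel_addr_16 (uint32_t *bits, int stride_words, int x, int y)
    {
        size_t byte_stride = (size_t) stride_words * sizeof (uint32_t);
        return (uint16_t *) ((char *) bits + y * byte_stride + x * 2);
    }

The assembly routines then want the stride in units of their own pixel size, which is why the fill path hands them byte_stride / 2 for 16 bpp and byte_stride / 4 for 32 bpp.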
--- a/gfx/cairo/libpixman/src/pixman-bits-image.c +++ b/gfx/cairo/libpixman/src/pixman-bits-image.c @@ -408,20 +408,118 @@ bits_image_fetch_pixel_convolution (bits sbtot += (int)BLUE_8 (pixel) * f; satot += (int)ALPHA_8 (pixel) * f; } params++; } } - satot >>= 16; - srtot >>= 16; - sgtot >>= 16; - sbtot >>= 16; + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); +} + +static uint32_t +bits_image_fetch_pixel_separable_convolution (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + pixman_fixed_t *params = image->common.filter_params; + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + int cwidth = pixman_fixed_to_int (params[0]); + int cheight = pixman_fixed_to_int (params[1]); + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int x_phase_shift = 16 - x_phase_bits; + int y_phase_shift = 16 - y_phase_bits; + int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; + int y_off = ((cheight << 16) - pixman_fixed_1) >> 1; + pixman_fixed_t *y_params; + int srtot, sgtot, sbtot, satot; + int32_t x1, x2, y1, y2; + int32_t px, py; + int i, j; + + /* Round x and y to the middle of the closest phase before continuing. This + * ensures that the convolution matrix is aligned right, since it was + * positioned relative to a particular phase (and not relative to whatever + * exact fraction we happen to get here). + */ + x = ((x >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); + y = ((y >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + + px = (x & 0xffff) >> x_phase_shift; + py = (y & 0xffff) >> y_phase_shift; + + y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + srtot = sgtot = sbtot = satot = 0; + + for (i = y1; i < y2; ++i) + { + pixman_fixed_48_16_t fy = *y_params++; + pixman_fixed_t *x_params = params + 4 + px * cwidth; + + if (fy) + { + for (j = x1; j < x2; ++j) + { + pixman_fixed_t fx = *x_params++; + int rx = j; + int ry = i; + + if (fx) + { + pixman_fixed_t f; + uint32_t pixel; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, &rx, width); + repeat (repeat_mode, &ry, height); + + pixel = get_pixel (image, rx, ry, FALSE); + } + else + { + pixel = get_pixel (image, rx, ry, TRUE); + } + + f = (fy * fx + 0x8000) >> 16; + + srtot += (int)RED_8 (pixel) * f; + sgtot += (int)GREEN_8 (pixel) * f; + sbtot += (int)BLUE_8 (pixel) * f; + satot += (int)ALPHA_8 (pixel) * f; + } + } + } + } + + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; satot = CLIP (satot, 0, 0xff); srtot = CLIP (srtot, 0, 0xff); sgtot = CLIP (sgtot, 0, 0xff); sbtot = CLIP (sbtot, 0, 0xff); return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); } @@ -444,16 +542,20 @@ bits_image_fetch_pixel_filtered (bits_im case PIXMAN_FILTER_BEST: return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel); break; case PIXMAN_FILTER_CONVOLUTION: return bits_image_fetch_pixel_convolution 
(image, x, y, get_pixel); break; + case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: + return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel); + break; + default: break; } return 0; } static uint32_t * @@ -613,19 +715,163 @@ bits_image_fetch_general (pixman_iter_t x += ux; y += uy; w += uw; } return buffer; } -static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; +typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + +static force_inline void +bits_image_fetch_separable_convolution_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + bits_image_t *bits = &image->bits; + pixman_fixed_t *params = image->common.filter_params; + int cwidth = pixman_fixed_to_int (params[0]); + int cheight = pixman_fixed_to_int (params[1]); + int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; + int y_off = ((cheight << 16) - pixman_fixed_1) >> 1; + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int x_phase_shift = 16 - x_phase_bits; + int y_phase_shift = 16 - y_phase_bits; + pixman_fixed_t vx, vy; + pixman_fixed_t ux, uy; + pixman_vector_t v; + int k; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + vx = v.vector[0]; + vy = v.vector[1]; + + for (k = 0; k < width; ++k) + { + pixman_fixed_t *y_params; + int satot, srtot, sgtot, sbtot; + pixman_fixed_t x, y; + int32_t x1, x2, y1, y2; + int32_t px, py; + int i, j; + + if (mask && !mask[k]) + goto next; + + /* Round x and y to the middle of the closest phase before continuing. This + * ensures that the convolution matrix is aligned right, since it was + * positioned relative to a particular phase (and not relative to whatever + * exact fraction we happen to get here). + */ + x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); + y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + + px = (x & 0xffff) >> x_phase_shift; + py = (y & 0xffff) >> y_phase_shift; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + satot = srtot = sgtot = sbtot = 0; -typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + + for (i = y1; i < y2; ++i) + { + pixman_fixed_t fy = *y_params++; + + if (fy) + { + pixman_fixed_t *x_params = params + 4 + px * cwidth; + + for (j = x1; j < x2; ++j) + { + pixman_fixed_t fx = *x_params++; + int rx = j; + int ry = i; + + if (fx) + { + pixman_fixed_t f; + uint32_t pixel, mask; + uint8_t *row; + + mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, &rx, bits->width); + repeat (repeat_mode, &ry, bits->height); + + row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry; + pixel = convert_pixel (row, rx) | mask; + } + else + { + if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height) + { + pixel = 0; + } + else + { + row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry; + pixel = convert_pixel (row, rx) | mask; + } + } + + f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16; + srtot += (int)RED_8 (pixel) * f; + sgtot += (int)GREEN_8 (pixel) * f; + sbtot += (int)BLUE_8 (pixel) * f; + satot += (int)ALPHA_8 (pixel) * f; + } + } + } + } + + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0); + + next: + vx += ux; + vy += uy; + } +} + +static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static force_inline void bits_image_fetch_bilinear_affine (pixman_image_t * image, int offset, int line, int width, uint32_t * buffer, const uint32_t * mask, @@ -863,19 +1109,36 @@ static force_inline uint32_t convert_a8 (const uint8_t *row, int x) { return *(row + x) << 24; } static force_inline uint32_t convert_r5g6b5 (const uint8_t *row, int x) { - return CONVERT_0565_TO_0888 (*((uint16_t *)row + x)); + return convert_0565_to_0888 (*((uint16_t *)row + x)); } +#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode) \ + static uint32_t * \ + bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t *iter, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_separable_convolution_affine ( \ + iter->image, \ + iter->x, iter->y++, \ + iter->width, \ + iter->buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + \ + return iter->buffer; \ + } + #define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \ static uint32_t * \ bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t *iter, \ const uint32_t * mask) \ { \ bits_image_fetch_bilinear_affine (iter->image, \ iter->x, iter->y++, \ iter->width, \ @@ -898,17 +1161,18 @@ convert_r5g6b5 (const uint8_t *row, int convert_ ## format, \ PIXMAN_ ## format, \ repeat_mode); \ return iter->buffer; \ } #define MAKE_FETCHERS(name, format, repeat_mode) \ MAKE_NEAREST_FETCHER (name, format, repeat_mode) \ - MAKE_BILINEAR_FETCHER (name, format, repeat_mode) + MAKE_BILINEAR_FETCHER (name, format, repeat_mode) \ + MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode) MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD) MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE) MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL) MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD) MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE) MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) @@ -1148,31 +1412,46 @@ static const fetcher_info_t fetcher_info #define GENERAL_NEAREST_FLAGS \ (FAST_PATH_NO_ALPHA_MAP | \ FAST_PATH_NO_ACCESSORS | \ FAST_PATH_HAS_TRANSFORM | \ FAST_PATH_AFFINE_TRANSFORM | \ FAST_PATH_NEAREST_FILTER) +#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + 
FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_SEPARABLE_CONVOLUTION_FILTER) + +#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_separable_convolution_affine_ ## name, \ + _pixman_image_get_scanline_generic_float \ + }, + #define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ { PIXMAN_ ## format, \ GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ bits_image_fetch_bilinear_affine_ ## name, \ _pixman_image_get_scanline_generic_float \ }, #define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \ { PIXMAN_ ## format, \ GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ bits_image_fetch_nearest_affine_ ## name, \ _pixman_image_get_scanline_generic_float \ }, #define AFFINE_FAST_PATHS(name, format, repeat) \ + SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \ BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ NEAREST_AFFINE_FAST_PATH(name, format, repeat) AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD) AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
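Two details of the new separable-convolution fetchers are worth spelling out. First, the channel totals are now rounded rather than truncated: (satot + 0x8000) >> 16 adds one half in 16.16 fixed point before the shift. Second, the sample position is snapped to the centre of its phase before the filter tables are indexed, so the per-phase coefficients line up with the positions they were computed for. A standalone sketch of that snapping, using the same parameter layout as the code above (params[2] holds the number of x phase bits):

    #include <stdint.h>

    typedef int32_t pixman_fixed_t; /* 16.16 fixed point, as in pixman */

    /* Snap a 16.16 coordinate to the middle of the closest phase and
     * return the phase index used to select a row of coefficients. */
    static int32_t
    snap_to_phase (pixman_fixed_t *x, int x_phase_bits)
    {
        int x_phase_shift = 16 - x_phase_bits;

        *x = ((*x >> x_phase_shift) << x_phase_shift)
           + ((1 << x_phase_shift) >> 1);

        /* the low 16 bits now identify one of 2^x_phase_bits phases */
        return (*x & 0xffff) >> x_phase_shift;
    }

For example, with x_phase_bits == 4 there are 16 phases; any x whose fractional part lies in [0x3000, 0x4000) snaps to 0x3800 and selects phase 3.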
--- a/gfx/cairo/libpixman/src/pixman-combine-float.c +++ b/gfx/cairo/libpixman/src/pixman-combine-float.c @@ -37,16 +37,18 @@ /* Workaround for http://gcc.gnu.org/PR54965 */ /* GCC 4.6 has problems with force_inline, so just use normal inline instead */ #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 6) #undef force_inline #define force_inline __inline__ #endif +#define IS_ZERO(f) (-FLT_MIN < (f) && (f) < FLT_MIN) + typedef float (* combine_channel_t) (float sa, float s, float da, float d); static force_inline void combine_inner (pixman_bool_t component, float *dest, const float *src, const float *mask, int n_pixels, combine_channel_t combine_a, combine_channel_t combine_c) { int i; @@ -196,66 +198,66 @@ get_factor (combine_factor_t factor, flo f = 1 - sa; break; case INV_DA: f = 1 - da; break; case SA_OVER_DA: - if (da == 0.0f) + if (IS_ZERO (da)) f = 1.0f; else f = CLAMP (sa / da); break; case DA_OVER_SA: - if (sa == 0.0f) + if (IS_ZERO (sa)) f = 1.0f; else f = CLAMP (da / sa); break; case INV_SA_OVER_DA: - if (da == 0.0f) + if (IS_ZERO (da)) f = 1.0f; else f = CLAMP ((1.0f - sa) / da); break; case INV_DA_OVER_SA: - if (sa == 0.0f) + if (IS_ZERO (sa)) f = 1.0f; else f = CLAMP ((1.0f - da) / sa); break; case ONE_MINUS_SA_OVER_DA: - if (da == 0.0f) + if (IS_ZERO (da)) f = 0.0f; else f = CLAMP (1.0f - sa / da); break; case ONE_MINUS_DA_OVER_SA: - if (sa == 0.0f) + if (IS_ZERO (sa)) f = 0.0f; else f = CLAMP (1.0f - da / sa); break; case ONE_MINUS_INV_DA_OVER_SA: - if (sa == 0.0f) + if (IS_ZERO (sa)) f = 0.0f; else f = CLAMP (1.0f - (1.0f - da) / sa); break; case ONE_MINUS_INV_SA_OVER_DA: - if (da == 0.0f) + if (IS_ZERO (da)) f = 0.0f; else f = CLAMP (1.0f - (1.0f - sa) / da); break; } return f; } @@ -398,34 +400,34 @@ blend_lighten (float sa, float s, float return s; else return d; } static force_inline float blend_color_dodge (float sa, float s, float da, float d) { - if (d == 0.0f) + if (IS_ZERO (d)) return 0.0f; else if (d * sa >= sa * da - s * da) return sa * da; - else if (sa - s == 0.0f) + else if (IS_ZERO (sa - s)) return sa * da; else return sa * sa * d / (sa - s); } static force_inline float blend_color_burn (float sa, float s, float da, float d) { if (d >= da) return sa * da; else if (sa * (da - d) >= s * da) return 0.0f; - else if (s == 0.0f) + else if (IS_ZERO (s)) return 0.0f; else return sa * (da - sa * (da - d) / s); } static force_inline float blend_hard_light (float sa, float s, float da, float d) { @@ -435,24 +437,24 @@ blend_hard_light (float sa, float s, flo return sa * da - 2 * (da - d) * (sa - s); } static force_inline float blend_soft_light (float sa, float s, float da, float d) { if (2 * s < sa) { - if (da == 0.0f) + if (IS_ZERO (da)) return d * sa; else return d * sa - d * (da - d) * (sa - 2 * s) / da; } else { - if (da == 0.0f) + if (IS_ZERO (da)) { return 0.0f; } else { if (4 * d <= da) return d * sa + (2 * s - sa) * d * ((16 * d / da - 12) * d / da + 3); else @@ -646,45 +648,48 @@ get_sat (const rgb_t *c) } static void clip_color (rgb_t *color, float a) { float l = get_lum (color); float n = channel_min (color); float x = channel_max (color); + float t; if (n < 0.0f) { - if ((l - n) < 4 * FLT_EPSILON) + t = l - n; + if (IS_ZERO (t)) { color->r = 0.0f; color->g = 0.0f; color->b = 0.0f; } else { - color->r = l + (((color->r - l) * l) / (l - n)); - color->g = l + (((color->g - l) * l) / (l - n)); - color->b = l + (((color->b - l) * l) / (l - n)); + color->r = l + (((color->r - l) * l) / t); + color->g = l + (((color->g - l) * l) / t); + color->b = l + 
(((color->b - l) * l) / t); } } if (x > a) { - if ((x - l) < 4 * FLT_EPSILON) + t = x - l; + if (IS_ZERO (t)) { color->r = a; color->g = a; color->b = a; } else { - color->r = l + (((color->r - l) * (a - l) / (x - l))); - color->g = l + (((color->g - l) * (a - l) / (x - l))); - color->b = l + (((color->b - l) * (a - l) / (x - l))); + color->r = l + (((color->r - l) * (a - l) / t)); + color->g = l + (((color->g - l) * (a - l) / t)); + color->b = l + (((color->b - l) * (a - l) / t)); } } } static void set_lum (rgb_t *color, float sa, float l) { float d = l - get_lum (color); @@ -695,16 +700,17 @@ set_lum (rgb_t *color, float sa, float l clip_color (color, sa); } static void set_sat (rgb_t *src, float sat) { float *max, *mid, *min; + float t; if (src->r > src->g) { if (src->r > src->b) { max = &(src->r); if (src->g > src->b) @@ -745,24 +751,26 @@ set_sat (rgb_t *src, float sat) else { max = &(src->b); mid = &(src->g); } } } - if (*max > *min) + t = *max - *min; + + if (IS_ZERO (t)) { - *mid = (((*mid - *min) * sat) / (*max - *min)); - *max = sat; + *mid = *max = 0.0f; } else { - *mid = *max = 0.0f; + *mid = ((*mid - *min) * sat) / t; + *max = sat; } *min = 0.0f; } /* * Hue: * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
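The IS_ZERO test that now guards every division treats anything inside (-FLT_MIN, FLT_MIN) as zero, which catches denormals and signed zeros that an exact == 0.0f comparison would let through into a divide. A small standalone illustration, assuming a platform with subnormal support (the values are chosen only for demonstration):

    #include <float.h>
    #include <stdio.h>

    #define IS_ZERO(f) (-FLT_MIN < (f) && (f) < FLT_MIN)

    int
    main (void)
    {
        float da = FLT_MIN / 2.0f; /* a denormal: not exactly zero */
        float sa = 0.5f;

        printf ("%d %d\n", da == 0.0f, IS_ZERO (da)); /* prints: 0 1 */

        /* guarded quotient, as in get_factor's SA_OVER_DA case */
        printf ("%f\n", IS_ZERO (da) ? 1.0f : sa / da);
        return 0;
    }

The same pattern replaces the old (l - n) < 4 * FLT_EPSILON checks in clip_color, with the difference stored in a temporary so the tested value and the divisor are guaranteed to be the same.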
--- a/gfx/cairo/libpixman/src/pixman-combine16.c +++ b/gfx/cairo/libpixman/src/pixman-combine16.c @@ -20,26 +20,16 @@ combine_mask (const uint32_t src, const return 0; s = src; UN8x4_MUL_UN8 (s, m); return s; } -static inline uint32_t convert_0565_to_8888(uint16_t color) -{ - return CONVERT_0565_TO_8888(color); -} - -static inline uint16_t convert_8888_to_0565(uint32_t color) -{ - return CONVERT_8888_TO_0565(color); -} - static void combine_src_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dest, const uint32_t * src, const uint32_t * mask, int width) {
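The two inline wrappers deleted here only forwarded to the upper-case CONVERT_* macros; the rest of the patch moves callers onto lower-case helpers of the same names (see the pixman-dither.h and pixman-fast-path.c hunks below), presumably now provided in a shared header. For reference, a generic scalar sketch of what an r5g6b5 to 8888 expansion does, with the usual replication of the high bits so that full-scale channel values stay full-scale (this illustrates the technique; it is not a copy of pixman's macro):

    #include <stdint.h>

    static uint32_t
    expand_0565_to_8888 (uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5) & 0x3f;
        uint32_t b = p & 0x1f;

        /* replicate top bits: 0x1f -> 0xff, 0x00 -> 0x00 */
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xff000000 | (r << 16) | (g << 8) | b; /* opaque alpha */
    }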
--- a/gfx/cairo/libpixman/src/pixman-combine32.c +++ b/gfx/cairo/libpixman/src/pixman-combine32.c @@ -191,24 +191,68 @@ combine_over_u (pixman_implementation_t pixman_op_t op, uint32_t * dest, const uint32_t * src, const uint32_t * mask, int width) { int i; - for (i = 0; i < width; ++i) + if (!mask) + { + for (i = 0; i < width; ++i) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + } + else { - uint32_t s = combine_mask (src, mask, i); - uint32_t d = *(dest + i); - uint32_t ia = ALPHA_8 (~s); - - UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); - *(dest + i) = d; + for (i = 0; i < width; ++i) + { + uint32_t m = ALPHA_8 (*(mask + i)); + if (m == 0xFF) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + else if (m) + { + uint32_t s = *(src + i); + if (s) + { + uint32_t d = *(dest + i); + UN8x4_MUL_UN8 (s, m); + UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s); + *(dest + i) = d; + } + } + } } } static void combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dest, const uint32_t * src,
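The rewritten combine_over_u peels the cheap cases off before doing any arithmetic: an opaque source pixel (alpha 0xFF) is copied straight through, a fully transparent one (s == 0) leaves the destination alone, and the masked loop applies the same two tests per pixel once the mask byte itself has been checked against 0xFF and 0. Only the remaining pixels pay for UN8x4_MUL_UN8_ADD_UN8x4. As a reference for what that macro computes on each packed channel, here is a scalar sketch of premultiplied OVER (the helper is illustrative, not pixman code):

    #include <stdint.h>

    /* d' = s + d * (255 - a) / 255 with round-to-nearest division by 255;
     * for valid premultiplied pixels the sum cannot exceed 255. */
    static uint8_t
    over_channel (uint8_t s, uint8_t d, uint8_t a)
    {
        uint32_t t = (uint32_t) d * (255 - a) + 0x80;
        return (uint8_t) (s + ((t + (t >> 8)) >> 8));
    }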
--- a/gfx/cairo/libpixman/src/pixman-combine32.h
+++ b/gfx/cairo/libpixman/src/pixman-combine32.h
@@ -15,28 +15,71 @@
 #define RB_MASK_PLUS_ONE 0x10000100
 
 #define ALPHA_8(x) ((x) >> A_SHIFT)
 #define RED_8(x) (((x) >> R_SHIFT) & MASK)
 #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
 #define BLUE_8(x) ((x) & MASK)
 
 /*
+ * ARMv6 has the UQADD8 instruction, which implements unsigned saturated
+ * addition for 8-bit values packed in 32-bit registers. It is very useful
+ * for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would
+ * otherwise need a lot of arithmetic operations to simulate this operation).
+ * Since most of the major ARM Linux distros are built for ARMv7, we are
+ * much less dependent on runtime CPU detection and can get practical
+ * benefits from conditional compilation here for a lot of users.
+ */
+
+#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
+    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
+#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
+    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_7__) || \
+    defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
+    defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
+
+static force_inline uint32_t
+un8x4_add_un8x4 (uint32_t x, uint32_t y)
+{
+    uint32_t t;
+    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
+    return t;
+}
+
+#define UN8x4_ADD_UN8x4(x, y) \
+    ((x) = un8x4_add_un8x4 ((x), (y)))
+
+#define UN8_rb_ADD_UN8_rb(x, y, t) \
+    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))
+
+#define ADD_UN8(x, y, t) \
+    ((t) = (x), un8x4_add_un8x4 ((t), (y)))
+
+#endif
+#endif
+
+/*****************************************************************************/
+
+/*
  * Helper macros.
  */
 
 #define MUL_UN8(a, b, t) \
     ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))
 
 #define DIV_UN8(a, b) \
     (((uint16_t) (a) * MASK + ((b) / 2)) / (b))
 
+#ifndef ADD_UN8
 #define ADD_UN8(x, y, t) \
     ((t) = (x) + (y), \
      (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
+#endif
 
 #define DIV_ONE_UN8(x) \
     (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
 
 /*
  * The methods below use some tricks to be able to do two color
  * components at the same time.
  */
@@ -51,23 +94,25 @@
 	t += RB_ONE_HALF; \
 	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
 	x &= RB_MASK; \
     } while (0)
 
 /*
  * x_rb = min (x_rb + y_rb, 255)
  */
+#ifndef UN8_rb_ADD_UN8_rb
 #define UN8_rb_ADD_UN8_rb(x, y, t) \
     do \
     { \
 	t = ((x) + (y)); \
 	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
 	x = (t & RB_MASK); \
     } while (0)
+#endif
 
 /*
  * x_rb = (x_rb * a_rb) / 255
 */
 #define UN8_rb_MUL_UN8_rb(x, a, t) \
     do \
     { \
 	t = (x & MASK) * (a & MASK); \
@@ -203,23 +248,25 @@
 	UN8_rb_ADD_UN8_rb (r2__, r3__, t__); \
 	\
 	x = r1__ | (r2__ << G_SHIFT); \
     } while (0)
 
 /*
  * x_c = min(x_c + y_c, 255)
  */
+#ifndef UN8x4_ADD_UN8x4
 #define UN8x4_ADD_UN8x4(x, y) \
     do \
     { \
 	uint32_t r1__, r2__, r3__, t__; \
 	\
 	r1__ = (x) & RB_MASK; \
 	r2__ = (y) & RB_MASK; \
 	UN8_rb_ADD_UN8_rb (r1__, r2__, t__); \
 	\
 	r2__ = ((x) >> G_SHIFT) & RB_MASK; \
 	r3__ = ((y) >> G_SHIFT) & RB_MASK; \
 	UN8_rb_ADD_UN8_rb (r2__, r3__, t__); \
 	\
 	x = r1__ | (r2__ << G_SHIFT); \
     } while (0)
+#endif
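UQADD8 adds the four bytes of two registers pairwise and clamps each sum to 255, so one instruction replaces the mask-and-carry sequence in the generic UN8x4_ADD_UN8x4 fallback above. A portable C model of the same semantics, handy for checking the inline-asm path against the fallback (the function name is illustrative):

    #include <stdint.h>

    /* Per-byte unsigned saturating add: each packed byte becomes
     * min (x_byte + y_byte, 255), which is what UQADD8 computes. */
    static uint32_t
    saturating_add_un8x4 (uint32_t x, uint32_t y)
    {
        uint32_t r = 0;
        int i;

        for (i = 0; i < 32; i += 8)
        {
            uint32_t sum = ((x >> i) & 0xff) + ((y >> i) & 0xff);
            r |= (sum > 0xff ? 0xff : sum) << i;
        }
        return r;
    }

For example, 0x01ff80fe + 0x01027f03 yields 0x02ffffff, with the two overflowing bytes clamped to 0xff.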
--- a/gfx/cairo/libpixman/src/pixman-compiler.h +++ b/gfx/cairo/libpixman/src/pixman-compiler.h @@ -51,16 +51,20 @@ #ifndef INT64_MIN # define INT64_MIN (-9223372036854775807-1) #endif #ifndef INT64_MAX # define INT64_MAX (9223372036854775807) #endif +#ifndef SIZE_MAX +# define SIZE_MAX ((size_t)-1) +#endif + #ifndef M_PI # define M_PI 3.14159265358979323846 #endif #ifdef _MSC_VER /* 'inline' is available only in C++ in MSVC */ # define inline __inline
--- a/gfx/cairo/libpixman/src/pixman-dither.h
+++ b/gfx/cairo/libpixman/src/pixman-dither.h
@@ -41,11 +41,11 @@ static inline uint16_t dither_32_to_16(u
 }
 
 static inline uint16_t dither_8888_to_0565(uint32_t color, pixman_bool_t toggle)
 {
     // alternate between a perturbed truncation and a regular truncation
     if (toggle) {
         return dither_32_to_16(color);
     } else {
-        return CONVERT_8888_TO_0565(color);
+        return convert_8888_to_0565(color);
     }
 }
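dither_8888_to_0565 alternates between the perturbed truncation in dither_32_to_16 (defined just above this hunk) and the plain convert_8888_to_0565 truncation, spreading the rounding error between neighbouring pixels instead of accumulating it in one direction. A hedged sketch of how a caller might drive the toggle in a checkerboard pattern; the driving loop is an illustration, not code from this patch:

    #include <stdint.h>

    typedef int pixman_bool_t; /* simplified for this sketch */

    uint16_t dither_8888_to_0565 (uint32_t color, pixman_bool_t toggle);

    static void
    dither_row (uint16_t *dst, const uint32_t *src, int width, int y)
    {
        int x;

        /* flip the toggle per pixel and per row so adjacent pixels
         * truncate differently */
        for (x = 0; x < width; x++)
            dst[x] = dither_8888_to_0565 (src[x], (x ^ y) & 1);
    }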
--- a/gfx/cairo/libpixman/src/pixman-edge.c +++ b/gfx/cairo/libpixman/src/pixman-edge.c @@ -369,16 +369,17 @@ PIXMAN_RASTERIZE_EDGES (pixman_image_t * PIXMAN_EXPORT void pixman_rasterize_edges (pixman_image_t *image, pixman_edge_t * l, pixman_edge_t * r, pixman_fixed_t t, pixman_fixed_t b) { return_if_fail (image->type == BITS); + return_if_fail (PIXMAN_FORMAT_TYPE (image->bits.format) == PIXMAN_TYPE_A); if (image->bits.read_func || image->bits.write_func) pixman_rasterize_edges_accessors (image, l, r, t, b); else pixman_rasterize_edges_no_accessors (image, l, r, t, b); } #endif
--- a/gfx/cairo/libpixman/src/pixman-fast-path.c +++ b/gfx/cairo/libpixman/src/pixman-fast-path.c @@ -30,17 +30,17 @@ #include <stdlib.h> #include "pixman-private.h" #include "pixman-combine32.h" #include "pixman-inlines.h" static force_inline uint32_t fetch_24 (uint8_t *a) { - if (((unsigned long)a) & 1) + if (((uintptr_t)a) & 1) { #ifdef WORDS_BIGENDIAN return (*a << 16) | (*(uint16_t *)(a + 1)); #else return *a | (*(uint16_t *)(a + 1) << 8); #endif } else @@ -52,17 +52,17 @@ fetch_24 (uint8_t *a) #endif } } static force_inline void store_24 (uint8_t *a, uint32_t v) { - if (((unsigned long)a) & 1) + if (((uintptr_t)a) & 1) { #ifdef WORDS_BIGENDIAN *a = (uint8_t) (v >> 16); *(uint16_t *)(a + 1) = (uint16_t) (v); #else *a = (uint8_t) (v); *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); #endif @@ -502,25 +502,25 @@ fast_composite_over_n_8_0565 (pixman_imp { if (srca == 0xff) { d = src; } else { d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); + d = over (src, convert_0565_to_0888 (d)); } - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } else if (m) { d = *dst; - d = over (in (src, m), CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (in (src, m), convert_0565_to_0888 (d)); + *dst = convert_8888_to_0565 (d); } dst++; } } } static void fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, @@ -536,17 +536,17 @@ fast_composite_over_n_8888_0565_ca (pixm int32_t w; src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - src16 = CONVERT_8888_TO_0565 (src); + src16 = convert_8888_to_0565 (src); PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); while (height--) { dst = dst_line; dst_line += dst_stride; @@ -561,33 +561,33 @@ fast_composite_over_n_8888_0565_ca (pixm { if (srca == 0xff) { *dst = src16; } else { d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (src, convert_0565_to_0888 (d)); + *dst = convert_8888_to_0565 (d); } } else if (ma) { d = *dst; - d = CONVERT_0565_TO_0888 (d); + d = convert_0565_to_0888 (d); s = src; UN8x4_MUL_UN8x4 (s, ma); UN8x4_MUL_UN8 (ma, srca); ma = ~ma; UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } dst++; } } } static void fast_composite_over_8888_8888 (pixman_implementation_t *imp, @@ -724,56 +724,26 @@ fast_composite_over_8888_0565 (pixman_im { if (a == 0xff) { d = s; } else { d = *dst; - d = over (s, CONVERT_0565_TO_0888 (d)); + d = over (s, convert_0565_to_0888 (d)); } - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } dst++; } } } static void -fast_composite_src_x888_0565 (pixman_implementation_t *imp, - pixman_composite_info_t *info) -{ - PIXMAN_COMPOSITE_ARGS (info); - uint16_t *dst_line, *dst; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - *dst = CONVERT_8888_TO_0565 (s); - dst++; - } - } -} - -static void fast_composite_add_8_8 (pixman_implementation_t *imp, pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); uint8_t 
*dst_line, *dst; uint8_t *src_line, *src; int dst_stride, src_stride; int32_t w; @@ -833,23 +803,23 @@ fast_composite_add_0565_0565 (pixman_imp w = width; while (w--) { s = *src++; if (s) { d = *dst; - s = CONVERT_0565_TO_8888 (s); + s = convert_0565_to_8888 (s); if (d) { - d = CONVERT_0565_TO_8888 (d); + d = convert_0565_to_8888 (d); UN8x4_ADD_UN8x4 (s, d); } - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); } dst++; } } } static void fast_composite_add_8888_8888 (pixman_implementation_t *imp, @@ -1089,17 +1059,17 @@ fast_composite_over_n_1_0565 (pixman_imp PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, mask_stride, mask_line, 1); mask_line += mask_x >> 5; if (srca == 0xff) { - src565 = CONVERT_8888_TO_0565 (src); + src565 = convert_8888_to_0565 (src); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; @@ -1137,18 +1107,18 @@ fast_composite_over_n_1_0565 (pixman_imp { if (bitmask == 0) { bitcache = *mask++; bitmask = CREATE_BITMASK (0); } if (bitcache & bitmask) { - d = over (src, CONVERT_0565_TO_0888 (*dst)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (src, convert_0565_to_0888 (*dst)); + *dst = convert_8888_to_0565 (d); } bitmask = UPDATE_BITMASK (bitmask); dst++; } } } } @@ -1171,17 +1141,17 @@ fast_composite_solid_fill (pixman_implem } else if (dest_image->bits.format == PIXMAN_a8) { src = src >> 24; } else if (dest_image->bits.format == PIXMAN_r5g6b5 || dest_image->bits.format == PIXMAN_b5g6r5) { - src = CONVERT_8888_TO_0565 (src); + src = convert_8888_to_0565 (src); } pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y, width, height, src); } @@ -1255,18 +1225,18 @@ scaled_bilinear_scanline_8888_565_OVER ( uint32_t src, result; uint16_t d; d = *dst; src = bilinear_interpolation (tl, tr, bl, br, pixman_fixed_to_bilinear_weight(vx), wb); vx += unit_x; - result = over (src, CONVERT_0565_TO_0888 (d)); - *dst++ = CONVERT_8888_TO_0565(result); + result = over (src, convert_0565_to_0888 (d)); + *dst++ = convert_8888_to_0565 (result); } } static force_inline void scaled_bilinear_scanline_8888_8888_OVER (uint32_t * dst, const uint32_t * mask, const uint32_t * src_top, const uint32_t * src_bottom, @@ -1314,24 +1284,24 @@ scaled_bilinear_scanline_565_565_SRC (ui { while ((w -= 1) >= 0) { uint16_t tl = src_top [pixman_fixed_to_int (vx)]; uint16_t tr = src_top [pixman_fixed_to_int (vx) + 1]; uint16_t bl = src_bottom [pixman_fixed_to_int (vx)]; uint16_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; uint32_t d; - d = bilinear_interpolation(CONVERT_0565_TO_8888(tl), - CONVERT_0565_TO_8888(tr), - CONVERT_0565_TO_8888(bl), - CONVERT_0565_TO_8888(br), - pixman_fixed_to_bilinear_weight(vx), + d = bilinear_interpolation(convert_0565_to_8888 (tl), + convert_0565_to_8888 (tr), + convert_0565_to_8888 (bl), + convert_0565_to_8888 (br), + pixman_fixed_to_bilinear_weight (vx), wb); vx += unit_x; - *dst++ = CONVERT_8888_TO_0565(d); + *dst++ = convert_8888_to_0565 (d); } } #else /* This is a clever low resolution bilinear interpolation inspired by the code in Skia */ @@ -1460,174 +1430,168 @@ FAST_BILINEAR_MAINLOOP_COMMON (8888_8888 static void fast_composite_tiled_repeat (pixman_implementation_t *imp, pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); pixman_composite_func_t func; pixman_format_code_t mask_format; uint32_t src_flags, mask_flags; + 
int32_t sx, sy; + int32_t width_remain; + int32_t num_pixels; + int32_t src_width; + int32_t i, j; + pixman_image_t extended_src_image; + uint32_t extended_src[REPEAT_MIN_WIDTH * 2]; + pixman_bool_t need_src_extension; + uint32_t *src_line; + int32_t src_stride; + int32_t src_bpp; + pixman_composite_info_t info2 = *info; src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; if (mask_image) { mask_format = mask_image->common.extended_format_code; mask_flags = info->mask_flags; } else { mask_format = PIXMAN_null; mask_flags = FAST_PATH_IS_OPAQUE; } - if (_pixman_implementation_lookup_composite ( - imp->toplevel, info->op, - src_image->common.extended_format_code, src_flags, - mask_format, mask_flags, - dest_image->common.extended_format_code, info->dest_flags, - &imp, &func)) + _pixman_implementation_lookup_composite ( + imp->toplevel, info->op, + src_image->common.extended_format_code, src_flags, + mask_format, mask_flags, + dest_image->common.extended_format_code, info->dest_flags, + &imp, &func); + + src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format); + + if (src_image->bits.width < REPEAT_MIN_WIDTH && + (src_bpp == 32 || src_bpp == 16 || src_bpp == 8) && + !src_image->bits.indexed) { - int32_t sx, sy; - int32_t width_remain; - int32_t num_pixels; - int32_t src_width; - int32_t i, j; - pixman_image_t extended_src_image; - uint32_t extended_src[REPEAT_MIN_WIDTH * 2]; - pixman_bool_t need_src_extension; - uint32_t *src_line; - int32_t src_stride; - int32_t src_bpp; - pixman_composite_info_t info2 = *info; + sx = src_x; + sx = MOD (sx, src_image->bits.width); + sx += width; + src_width = 0; + + while (src_width < REPEAT_MIN_WIDTH && src_width <= sx) + src_width += src_image->bits.width; + + src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t); - src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format); + /* Initialize/validate stack-allocated temporary image */ + _pixman_bits_image_init (&extended_src_image, src_image->bits.format, + src_width, 1, &extended_src[0], src_stride, + FALSE); + _pixman_image_validate (&extended_src_image); + + info2.src_image = &extended_src_image; + need_src_extension = TRUE; + } + else + { + src_width = src_image->bits.width; + need_src_extension = FALSE; + } + + sx = src_x; + sy = src_y; - if (src_image->bits.width < REPEAT_MIN_WIDTH && - (src_bpp == 32 || src_bpp == 16 || src_bpp == 8) && - !src_image->bits.indexed) + while (--height >= 0) + { + sx = MOD (sx, src_width); + sy = MOD (sy, src_image->bits.height); + + if (need_src_extension) { - sx = src_x; - sx = MOD (sx, src_image->bits.width); - sx += width; - src_width = 0; + if (src_bpp == 32) + { + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1); + + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; j++, i++) + extended_src[i] = src_line[j]; + } + } + else if (src_bpp == 16) + { + uint16_t *src_line_16; - while (src_width < REPEAT_MIN_WIDTH && src_width <= sx) - src_width += src_image->bits.width; - - src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t); + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride, + src_line_16, 1); + src_line = (uint32_t*)src_line_16; - /* Initialize/validate stack-allocated temporary image */ - _pixman_bits_image_init (&extended_src_image, src_image->bits.format, - src_width, 1, &extended_src[0], src_stride, - FALSE); - _pixman_image_validate (&extended_src_image); + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; 
j++, i++) + ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j]; + } + } + else if (src_bpp == 8) + { + uint8_t *src_line_8; - info2.src_image = &extended_src_image; - need_src_extension = TRUE; + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride, + src_line_8, 1); + src_line = (uint32_t*)src_line_8; + + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; j++, i++) + ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j]; + } + } + + info2.src_y = 0; } else { - src_width = src_image->bits.width; - need_src_extension = FALSE; + info2.src_y = sy; + } + + width_remain = width; + + while (width_remain > 0) + { + num_pixels = src_width - sx; + + if (num_pixels > width_remain) + num_pixels = width_remain; + + info2.src_x = sx; + info2.width = num_pixels; + info2.height = 1; + + func (imp, &info2); + + width_remain -= num_pixels; + info2.mask_x += num_pixels; + info2.dest_x += num_pixels; + sx = 0; } sx = src_x; - sy = src_y; - - while (--height >= 0) - { - sx = MOD (sx, src_width); - sy = MOD (sy, src_image->bits.height); - - if (need_src_extension) - { - if (src_bpp == 32) - { - PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1); - - for (i = 0; i < src_width; ) - { - for (j = 0; j < src_image->bits.width; j++, i++) - extended_src[i] = src_line[j]; - } - } - else if (src_bpp == 16) - { - uint16_t *src_line_16; - - PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride, - src_line_16, 1); - src_line = (uint32_t*)src_line_16; - - for (i = 0; i < src_width; ) - { - for (j = 0; j < src_image->bits.width; j++, i++) - ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j]; - } - } - else if (src_bpp == 8) - { - uint8_t *src_line_8; - - PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride, - src_line_8, 1); - src_line = (uint32_t*)src_line_8; + sy++; + info2.mask_x = info->mask_x; + info2.mask_y++; + info2.dest_x = info->dest_x; + info2.dest_y++; + } - for (i = 0; i < src_width; ) - { - for (j = 0; j < src_image->bits.width; j++, i++) - ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j]; - } - } - - info2.src_y = 0; - } - else - { - info2.src_y = sy; - } - - width_remain = width; - - while (width_remain > 0) - { - num_pixels = src_width - sx; - - if (num_pixels > width_remain) - num_pixels = width_remain; - - info2.src_x = sx; - info2.width = num_pixels; - info2.height = 1; - - func (imp, &info2); - - width_remain -= num_pixels; - info2.mask_x += num_pixels; - info2.dest_x += num_pixels; - sx = 0; - } - - sx = src_x; - sy++; - info2.mask_x = info->mask_x; - info2.mask_y++; - info2.dest_x = info->dest_x; - info2.dest_y++; - } - - if (need_src_extension) - _pixman_image_fini (&extended_src_image); - } - else - { - _pixman_log_error (FUNC, "Didn't find a suitable function "); - } + if (need_src_extension) + _pixman_image_fini (&extended_src_image); } /* Use more unrolling for src_0565_0565 because it is typically CPU bound */ static force_inline void scaled_nearest_scanline_565_565_SRC (uint16_t * dst, const uint16_t * src, int32_t w, pixman_fixed_t vx, @@ -2130,20 +2094,16 @@ static const pixman_fast_path_t c_fast_p PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, 
x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), @@ -2288,22 +2248,22 @@ pixman_fill1_line (uint32_t *dst, int of static void pixman_fill1 (uint32_t *bits, int stride, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { uint32_t *dst = bits + y * stride + (x >> 5); int offs = x & 31; - if (xor & 1) + if (filler & 1) { while (height--) { pixman_fill1_line (dst, offs, width, 1); dst += stride; } } else @@ -2318,21 +2278,21 @@ pixman_fill1 (uint32_t *bits, static void pixman_fill8 (uint32_t *bits, int stride, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int byte_stride = stride * (int) sizeof (uint32_t); uint8_t *dst = (uint8_t *) bits; - uint8_t v = xor & 0xff; + uint8_t v = filler & 0xff; int i; dst = dst + y * byte_stride + x; while (height--) { for (i = 0; i < width; ++i) dst[i] = v; @@ -2343,22 +2303,22 @@ pixman_fill8 (uint32_t *bits, static void pixman_fill16 (uint32_t *bits, int stride, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int short_stride = (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); uint16_t *dst = (uint16_t *)bits; - uint16_t v = xor & 0xffff; + uint16_t v = filler & 0xffff; int i; dst = dst + y * short_stride + x; while (height--) { for (i = 0; i < width; ++i) dst[i] = v; @@ -2369,68 +2329,262 @@ pixman_fill16 (uint32_t *bits, static void pixman_fill32 (uint32_t *bits, int stride, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int i; bits = bits + y * stride + x; while (height--) { for (i = 0; i < width; ++i) - bits[i] = xor; + bits[i] = filler; bits += stride; } } static pixman_bool_t fast_path_fill (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { switch (bpp) { case 1: - pixman_fill1 (bits, stride, x, y, width, height, xor); + pixman_fill1 (bits, stride, x, y, width, height, filler); break; case 8: - pixman_fill8 (bits, stride, x, y, width, height, xor); + pixman_fill8 (bits, stride, x, y, width, height, filler); break; case 16: - pixman_fill16 (bits, stride, x, y, width, height, xor); + pixman_fill16 (bits, stride, x, y, width, height, filler); break; case 32: - pixman_fill32 (bits, stride, x, y, width, height, xor); + pixman_fill32 (bits, stride, x, y, width, height, filler); break; default: return FALSE; } return TRUE; } +/*****************************************************************************/ + +static uint32_t * +fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ + int32_t w = iter->width; + uint32_t *dst = iter->buffer; + const uint16_t *src = (const uint16_t *)iter->bits; + 
+ iter->bits += iter->stride; + + /* Align the source buffer at 4 bytes boundary */ + if (w > 0 && ((uintptr_t)src & 3)) + { + *dst++ = convert_0565_to_8888 (*src++); + w--; + } + /* Process two pixels per iteration */ + while ((w -= 2) >= 0) + { + uint32_t sr, sb, sg, t0, t1; + uint32_t s = *(const uint32_t *)src; + src += 2; + sr = (s >> 8) & 0x00F800F8; + sb = (s << 3) & 0x00F800F8; + sg = (s >> 3) & 0x00FC00FC; + sr |= sr >> 5; + sb |= sb >> 5; + sg |= sg >> 6; + t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) | + (sb & 0xFF) | 0xFF000000; + t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) | + (sb >> 16) | 0xFF000000; +#ifdef WORDS_BIGENDIAN + *dst++ = t1; + *dst++ = t0; +#else + *dst++ = t0; + *dst++ = t1; +#endif + } + if (w & 1) + { + *dst = convert_0565_to_8888 (*src); + } + + return iter->buffer; +} + +static uint32_t * +fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) +{ + iter->bits += iter->stride; + return iter->buffer; +} + +/* Helper function for a workaround, which tries to ensure that 0x1F001F + * constant is always allocated in a register on RISC architectures. + */ +static force_inline uint32_t +convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F) +{ + uint32_t a, b; + a = (s >> 3) & x1F001F; + b = s & 0xFC00; + a |= a >> 5; + a |= b >> 5; + return a; +} + +static void +fast_write_back_r5g6b5 (pixman_iter_t *iter) +{ + int32_t w = iter->width; + uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); + const uint32_t *src = iter->buffer; + /* Workaround to ensure that x1F001F variable is allocated in a register */ + static volatile uint32_t volatile_x1F001F = 0x1F001F; + uint32_t x1F001F = volatile_x1F001F; + + while ((w -= 4) >= 0) + { + uint32_t s1 = *src++; + uint32_t s2 = *src++; + uint32_t s3 = *src++; + uint32_t s4 = *src++; + *dst++ = convert_8888_to_0565_workaround (s1, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s2, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s3, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s4, x1F001F); + } + if (w & 2) + { + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + } + if (w & 1) + { + *dst = convert_8888_to_0565_workaround (*src, x1F001F); + } +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; + pixman_iter_write_back_t write_back; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 }, + { PIXMAN_null } +}; + +static pixman_bool_t +fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + + if (iter->iter_flags & ITER_16) + return FALSE; + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FLAGS) == FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return TRUE; + } + } + } + + return FALSE; +} + +static pixman_bool_t +fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + + if 
(iter->iter_flags & ITER_16) + return FALSE; + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == + (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) + { + iter->get_scanline = fast_dest_fetch_noop; + } + else + { + iter->get_scanline = f->get_scanline; + } + iter->write_back = f->write_back; + return TRUE; + } + } + } + return FALSE; +} + + pixman_implementation_t * _pixman_implementation_create_fast_path (pixman_implementation_t *fallback) { pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); imp->fill = fast_path_fill; + imp->src_iter_init = fast_src_iter_init; + imp->dest_iter_init = fast_dest_iter_init; return imp; }
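The fetcher and write-back above rely on bit replication to widen 5- and 6-bit channels and on SIMD-within-a-register masking to narrow them again. As a reference for that arithmetic, here is an illustrative scalar pair (the _ref names are ours, not pixman's); the fast paths above compute the same values two or four pixels at a time.

#include <stdint.h>

/* Widen r5g6b5 to a8r8g8b8. The "(v << n) | (v >> m)" steps replicate the
 * top bits into the freshly opened low bits, so 0x1f expands to 0xff and
 * 0x3f expands to 0xff exactly, the same effect as the paired
 * "sr |= sr >> 5" statements in fast_fetch_r5g6b5 above. */
static uint32_t
convert_0565_to_8888_ref (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}

/* Narrow x8r8g8b8 to r5g6b5, mirroring convert_8888_to_0565_workaround:
 * red and blue are reduced to 5 bits and handled as one masked pair,
 * then green is shifted in between them. */
static uint16_t
convert_8888_to_0565_ref (uint32_t s)
{
    uint32_t a = (s >> 3) & 0x1F001F; /* R5 at bits 16-20, B5 at bits 0-4 */
    uint32_t b = s & 0xFC00;          /* G6 at bits 10-15 */

    a |= a >> 5;                      /* move R5 down to bits 11-15 */
    a |= b >> 5;                      /* drop G6 into bits 5-10 */

    return (uint16_t)a;
}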
new file mode 100644 --- /dev/null +++ b/gfx/cairo/libpixman/src/pixman-filter.c @@ -0,0 +1,350 @@ +/* + * Copyright 2012, Red Hat, Inc. + * Copyright 2012, Soren Sandmann + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Soren Sandmann <soren.sandmann@gmail.com> + */ +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <assert.h> +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +typedef double (* kernel_func_t) (double x); + +typedef struct +{ + pixman_kernel_t kernel; + kernel_func_t func; + double width; +} filter_info_t; + +static double +impulse_kernel (double x) +{ + return (x == 0.0)? 1.0 : 0.0; +} + +static double +box_kernel (double x) +{ + return 1; +} + +static double +linear_kernel (double x) +{ + return 1 - fabs (x); +} + +static double +gaussian_kernel (double x) +{ +#define SQRT2 (1.4142135623730950488016887242096980785696718753769480) +#define SIGMA (SQRT2 / 2.0) + + return exp (- x * x / (2 * SIGMA * SIGMA)) / (SIGMA * sqrt (2.0 * M_PI)); +} + +static double +sinc (double x) +{ + if (x == 0.0) + return 1.0; + else + return sin (M_PI * x) / (M_PI * x); +} + +static double +lanczos (double x, int n) +{ + return sinc (x) * sinc (x * (1.0 / n)); +} + +static double +lanczos2_kernel (double x) +{ + return lanczos (x, 2); +} + +static double +lanczos3_kernel (double x) +{ + return lanczos (x, 3); +} + +static double +nice_kernel (double x) +{ + return lanczos3_kernel (x * 0.75); +} + +static double +general_cubic (double x, double B, double C) +{ + double ax = fabs(x); + + if (ax < 1) + { + return ((12 - 9 * B - 6 * C) * ax * ax * ax + + (-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6; + } + else if (ax >= 1 && ax < 2) + { + return ((-B - 6 * C) * ax * ax * ax + + (6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) * + ax + (8 * B + 24 * C)) / 6; + } + else + { + return 0; + } +} + +static double +cubic_kernel (double x) +{ + /* This is the Mitchell-Netravali filter. + * + * (0.0, 0.5) would give us the Catmull-Rom spline, + * but that one seems to be indistinguishable from Lanczos2. 
+ */ + return general_cubic (x, 1/3.0, 1/3.0); +} + +static const filter_info_t filters[] = +{ + { PIXMAN_KERNEL_IMPULSE, impulse_kernel, 0.0 }, + { PIXMAN_KERNEL_BOX, box_kernel, 1.0 }, + { PIXMAN_KERNEL_LINEAR, linear_kernel, 2.0 }, + { PIXMAN_KERNEL_CUBIC, cubic_kernel, 4.0 }, + { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 6 * SIGMA }, + { PIXMAN_KERNEL_LANCZOS2, lanczos2_kernel, 4.0 }, + { PIXMAN_KERNEL_LANCZOS3, lanczos3_kernel, 6.0 }, + { PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel, 8.0 }, +}; + +/* This function scales @kernel2 by @scale, then + * aligns @x1 in @kernel1 with @x2 in @kernel2 + * and integrates the product of the kernels across @width. + * + * This function assumes that the intervals are within + * the kernels in question. E.g., the caller must not + * try to integrate a linear kernel outside of [-1:1] + */ +static double +integral (pixman_kernel_t kernel1, double x1, + pixman_kernel_t kernel2, double scale, double x2, + double width) +{ + /* If the integration interval crosses zero, break it into + * two separate integrals. This ensures that filters such + * as LINEAR that are not differentiable at 0 will still + * integrate properly. + */ + if (x1 < 0 && x1 + width > 0) + { + return + integral (kernel1, x1, kernel2, scale, x2, - x1) + + integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1); + } + else if (x2 < 0 && x2 + width > 0) + { + return + integral (kernel1, x1, kernel2, scale, x2, - x2) + + integral (kernel1, x1 - x2, kernel2, scale, 0, width + x2); + } + else if (kernel1 == PIXMAN_KERNEL_IMPULSE) + { + assert (width == 0.0); + return filters[kernel2].func (x2 * scale); + } + else if (kernel2 == PIXMAN_KERNEL_IMPULSE) + { + assert (width == 0.0); + return filters[kernel1].func (x1); + } + else + { + /* Integration via Simpson's rule */ +#define N_SEGMENTS 128 +#define SAMPLE(a1, a2) \ + (filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale)) + + double s = 0.0; + double h = width / (double)N_SEGMENTS; + int i; + + s = SAMPLE (x1, x2); + + for (i = 1; i < N_SEGMENTS; i += 2) + { + double a1 = x1 + h * i; + double a2 = x2 + h * i; + + s += 2 * SAMPLE (a1, a2); + + if (i >= 2 && i < N_SEGMENTS - 1) + s += 4 * SAMPLE (a1, a2); + } + + s += SAMPLE (x1 + width, x2 + width); + + return h * s * (1.0 / 3.0); + } +} + +static pixman_fixed_t * +create_1d_filter (int *width, + pixman_kernel_t reconstruct, + pixman_kernel_t sample, + double scale, + int n_phases) +{ + pixman_fixed_t *params, *p; + double step; + double size; + int i; + + size = scale * filters[sample].width + filters[reconstruct].width; + *width = ceil (size); + + p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t)); + if (!params) + return NULL; + + step = 1.0 / n_phases; + + for (i = 0; i < n_phases; ++i) + { + double frac = step / 2.0 + i * step; + pixman_fixed_t new_total; + int x, x1, x2; + double total; + + /* Sample convolution of reconstruction and sampling + * filter. See rounding.txt regarding the rounding + * and sample positions. 
+ */ + + x1 = ceil (frac - *width / 2.0 - 0.5); + x2 = x1 + *width; + + total = 0; + for (x = x1; x < x2; ++x) + { + double pos = x + 0.5 - frac; + double rlow = - filters[reconstruct].width / 2.0; + double rhigh = rlow + filters[reconstruct].width; + double slow = pos - scale * filters[sample].width / 2.0; + double shigh = slow + scale * filters[sample].width; + double c = 0.0; + double ilow, ihigh; + + if (rhigh >= slow && rlow <= shigh) + { + ilow = MAX (slow, rlow); + ihigh = MIN (shigh, rhigh); + + c = integral (reconstruct, ilow, + sample, 1.0 / scale, ilow - pos, + ihigh - ilow); + } + + total += c; + *p++ = (pixman_fixed_t)(c * 65535.0 + 0.5); + } + + /* Normalize */ + p -= *width; + total = 1 / total; + new_total = 0; + for (x = x1; x < x2; ++x) + { + pixman_fixed_t t = (*p) * total + 0.5; + + new_total += t; + *p++ = t; + } + + if (new_total != pixman_fixed_1) + *(p - *width / 2) += (pixman_fixed_1 - new_total); + } + + return params; +} + +/* Create the parameter list for a SEPARABLE_CONVOLUTION filter + * with the given kernels and scale parameters + */ +PIXMAN_EXPORT pixman_fixed_t * +pixman_filter_create_separable_convolution (int *n_values, + pixman_fixed_t scale_x, + pixman_fixed_t scale_y, + pixman_kernel_t reconstruct_x, + pixman_kernel_t reconstruct_y, + pixman_kernel_t sample_x, + pixman_kernel_t sample_y, + int subsample_bits_x, + int subsample_bits_y) +{ + double sx = fabs (pixman_fixed_to_double (scale_x)); + double sy = fabs (pixman_fixed_to_double (scale_y)); + pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL; + int subsample_x, subsample_y; + int width, height; + + subsample_x = (1 << subsample_bits_x); + subsample_y = (1 << subsample_bits_y); + + horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x); + vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, subsample_y); + + if (!horz || !vert) + goto out; + + *n_values = 4 + width * subsample_x + height * subsample_y; + + params = malloc (*n_values * sizeof (pixman_fixed_t)); + if (!params) + goto out; + + params[0] = pixman_int_to_fixed (width); + params[1] = pixman_int_to_fixed (height); + params[2] = pixman_int_to_fixed (subsample_bits_x); + params[3] = pixman_int_to_fixed (subsample_bits_y); + + memcpy (params + 4, horz, + width * subsample_x * sizeof (pixman_fixed_t)); + memcpy (params + 4 + width * subsample_x, vert, + height * subsample_y * sizeof (pixman_fixed_t)); + +out: + free (horz); + free (vert); + + return params; +}
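For context, a hypothetical caller of this new API might look like the sketch below; the kernel and phase choices are arbitrary and 'src' stands for an existing source image. pixman_image_set_filter copies the parameter array (see the pixman-image.c hunk further down), so freeing it afterwards is safe.

#include <stdlib.h>
#include "pixman.h"

/* Attach a separable-convolution filter for a uniform 2x downscale. */
static void
set_downscale_filter (pixman_image_t *src)
{
    int n_values;
    pixman_fixed_t *params = pixman_filter_create_separable_convolution (
        &n_values,
        pixman_double_to_fixed (2.0),               /* scale_x */
        pixman_double_to_fixed (2.0),               /* scale_y */
        PIXMAN_KERNEL_LINEAR, PIXMAN_KERNEL_LINEAR, /* reconstruct x, y */
        PIXMAN_KERNEL_BOX, PIXMAN_KERNEL_BOX,       /* sample x, y */
        4, 4);                                      /* 16 phases each way */

    if (params)
    {
        pixman_image_set_filter (src, PIXMAN_FILTER_SEPARABLE_CONVOLUTION,
                                 params, n_values);
        free (params); /* set_filter keeps its own copy */
    }
}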
--- a/gfx/cairo/libpixman/src/pixman-general.c +++ b/gfx/cairo/libpixman/src/pixman-general.c @@ -37,27 +37,27 @@ #include <string.h> #include "pixman-private.h" static pixman_bool_t general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) { pixman_image_t *image = iter->image; - if (image->type == SOLID) - _pixman_solid_fill_iter_init (image, iter); - else if (image->type == LINEAR) + if (image->type == LINEAR) _pixman_linear_gradient_iter_init (image, iter); else if (image->type == RADIAL) _pixman_radial_gradient_iter_init (image, iter); else if (image->type == CONICAL) _pixman_conical_gradient_iter_init (image, iter); else if (image->type == BITS) _pixman_bits_image_src_iter_init (image, iter); - else + else if (image->type == SOLID) + _pixman_log_error (FUNC, "Solid image not handled by noop"); + else _pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); return TRUE; } static pixman_bool_t general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) { @@ -198,19 +198,16 @@ general_composite_rect (pixman_implemen /* dest iter */ _pixman_implementation_dest_iter_init ( imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height, dest_buffer, narrow | op_flags[op].dst | rgb16, info->dest_flags); compose = _pixman_implementation_lookup_combiner ( imp->toplevel, op, component_alpha, narrow, !!rgb16); - if (!compose) - return; - for (i = 0; i < height; ++i) { uint32_t *s, *m, *d; m = mask_iter.get_scanline (&mask_iter, NULL); s = src_iter.get_scanline (&src_iter, m); d = dest_iter.get_scanline (&dest_iter, NULL);
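The scanline loop shown truncated above is the heart of the general path: each iterator yields one scanline per call, the combiner merges them, and the destination iterator stores the result. A self-contained miniature of that choreography (types and signatures simplified relative to pixman's real pixman_iter_t):

#include <stdint.h>

typedef struct iter iter_t;
struct iter
{
    uint32_t *(* get_scanline) (iter_t *);
    void (* write_back) (iter_t *);
};

/* One destination row per pass: fetch mask, fetch source (which may be
 * steered by the mask), fetch destination, combine, write back. */
static void
composite_rows (iter_t *src, iter_t *mask, iter_t *dest,
                void (* combine) (uint32_t *d, const uint32_t *s,
                                  const uint32_t *m, int w),
                int width, int height)
{
    int i;

    for (i = 0; i < height; ++i)
    {
        uint32_t *m = mask->get_scanline (mask);
        uint32_t *s = src->get_scanline (src);
        uint32_t *d = dest->get_scanline (dest);

        combine (d, s, m, width);

        dest->write_back (dest);
    }
}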
--- a/gfx/cairo/libpixman/src/pixman-glyph.c +++ b/gfx/cairo/libpixman/src/pixman-glyph.c @@ -458,26 +458,23 @@ pixman_composite_glyphs_no_mask (pixman_ { if (box32_intersect (&composite_box, pbox, &glyph_box)) { if (glyph_img->common.extended_format_code != glyph_format || glyph_img->common.flags != glyph_flags) { glyph_format = glyph_img->common.extended_format_code; glyph_flags = glyph_img->common.flags; - + _pixman_implementation_lookup_composite ( get_implementation(), op, src->common.extended_format_code, src->common.flags, glyph_format, glyph_flags | extra, dest_format, dest_flags, &implementation, &func); - - if (!func) - goto out; } info.src_x = src_x + composite_box.x1 - dest_x; info.src_y = src_y + composite_box.y1 - dest_y; info.mask_x = composite_box.x1 - (dest_x + glyphs[i].x - glyph->origin_x); info.mask_y = composite_box.y1 - (dest_y + glyphs[i].y - glyph->origin_y); info.dest_x = composite_box.x1; info.dest_y = composite_box.y1; @@ -503,17 +500,17 @@ add_glyphs (pixman_glyph_cache_t *cache, pixman_image_t *dest, int off_x, int off_y, int n_glyphs, const pixman_glyph_t *glyphs) { pixman_format_code_t glyph_format = PIXMAN_null; uint32_t glyph_flags = 0; pixman_composite_func_t func = NULL; pixman_implementation_t *implementation = NULL; - uint32_t dest_format; + pixman_format_code_t dest_format; uint32_t dest_flags; pixman_box32_t dest_box; pixman_composite_info_t info; pixman_image_t *white_img = NULL; pixman_bool_t white_src = FALSE; int i; _pixman_image_validate (dest); @@ -577,19 +574,16 @@ add_glyphs (pixman_glyph_cache_t *cache, } _pixman_implementation_lookup_composite ( get_implementation(), PIXMAN_OP_ADD, src_format, info.src_flags, mask_format, info.mask_flags, dest_format, dest_flags, &implementation, &func); - - if (!func) - goto out; } glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x; glyph_box.y1 = glyphs[i].y - glyph->origin_y + off_y; glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; if (box32_intersect (&composite_box, &glyph_box, &dest_box))
--- a/gfx/cairo/libpixman/src/pixman-image.c +++ b/gfx/cairo/libpixman/src/pixman-image.c @@ -368,16 +368,20 @@ compute_image_info (pixman_image_t *imag flags |= FAST_PATH_NEAREST_FILTER; } } break; case PIXMAN_FILTER_CONVOLUTION: break; + case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: + flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER; + break; + default: flags |= FAST_PATH_NO_CONVOLUTION_FILTER; break; } /* Repeat mode */ switch (image->common.repeat) { @@ -514,18 +518,19 @@ compute_image_info (pixman_image_t *imag } /* Both alpha maps and convolution filters can introduce * non-opaqueness in otherwise opaque images. Also * an image with component alpha turned on is only opaque * if all channels are opaque, so we simply turn it off * unconditionally for those images. */ - if (image->common.alpha_map || - image->common.filter == PIXMAN_FILTER_CONVOLUTION || + if (image->common.alpha_map || + image->common.filter == PIXMAN_FILTER_CONVOLUTION || + image->common.filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION || image->common.component_alpha) { flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE); } image->common.flags = flags; image->common.extended_format_code = code; } @@ -678,16 +683,29 @@ pixman_image_set_filter (pixman_image_t int n_params) { image_common_t *common = (image_common_t *)image; pixman_fixed_t *new_params; if (params == common->filter_params && filter == common->filter) return TRUE; + if (filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION) + { + int width = pixman_fixed_to_int (params[0]); + int height = pixman_fixed_to_int (params[1]); + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int n_x_phases = (1 << x_phase_bits); + int n_y_phases = (1 << y_phase_bits); + + return_val_if_fail ( + n_params == 4 + n_x_phases * width + n_y_phases * height, FALSE); + } + new_params = NULL; if (params) { new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t)); if (!new_params) return FALSE; memcpy (new_params, @@ -869,17 +887,17 @@ pixman_image_get_depth (pixman_image_t * } PIXMAN_EXPORT pixman_format_code_t pixman_image_get_format (pixman_image_t *image) { if (image->type == BITS) return image->bits.format; - return 0; + return PIXMAN_null; } uint32_t _pixman_image_get_solid (pixman_implementation_t *imp, pixman_image_t * image, pixman_format_code_t format) { uint32_t result;
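The new validation in pixman_image_set_filter above ties the parameter count to the four header values. A worked instance of that arithmetic, with illustrative sizes:

/* A 3-tap horizontal and 3-tap vertical filter with 4 phase bits in each
 * direction must pass exactly 4 + (1 << 4) * 3 + (1 << 4) * 3 = 100
 * fixed-point values. */
static int
expected_n_params (int width, int height, int x_phase_bits, int y_phase_bits)
{
    return 4 + (1 << x_phase_bits) * width + (1 << y_phase_bits) * height;
}

/* expected_n_params (3, 3, 4, 4) == 4 + 48 + 48 == 100 */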
--- a/gfx/cairo/libpixman/src/pixman-implementation.c +++ b/gfx/cairo/libpixman/src/pixman-implementation.c @@ -60,17 +60,23 @@ typedef struct { pixman_implementation_t * imp; pixman_fast_path_t fast_path; } cache [N_CACHED_FAST_PATHS]; } cache_t; PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache); -pixman_bool_t +static void +dummy_composite_rect (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ +} + +void _pixman_implementation_lookup_composite (pixman_implementation_t *toplevel, pixman_op_t op, pixman_format_code_t src_format, uint32_t src_flags, pixman_format_code_t mask_format, uint32_t mask_flags, pixman_format_code_t dest_format, uint32_t dest_flags, @@ -137,17 +143,21 @@ pixman_bool_t i = N_CACHED_FAST_PATHS - 1; goto update_cache; } ++info; } } - return FALSE; + + /* We should never reach this point */ + _pixman_log_error (FUNC, "No known composite function\n"); + *out_imp = NULL; + *out_func = dummy_composite_rect; update_cache: if (i) { while (i--) cache->cache[i + 1] = cache->cache[i]; cache->cache[0].imp = *out_imp; @@ -155,18 +165,26 @@ update_cache: cache->cache[0].fast_path.src_format = src_format; cache->cache[0].fast_path.src_flags = src_flags; cache->cache[0].fast_path.mask_format = mask_format; cache->cache[0].fast_path.mask_flags = mask_flags; cache->cache[0].fast_path.dest_format = dest_format; cache->cache[0].fast_path.dest_flags = dest_flags; cache->cache[0].fast_path.func = *out_func; } +} - return TRUE; +static void +dummy_combine (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ } pixman_combine_32_func_t _pixman_implementation_lookup_combiner (pixman_implementation_t *imp, pixman_op_t op, pixman_bool_t component_alpha, pixman_bool_t narrow, pixman_bool_t rgb16) @@ -197,17 +215,19 @@ pixman_combine_32_func_t f = (pixman_combine_32_func_t *)imp->combine_16[op]; if (f) return f; imp = imp->fallback; } - return NULL; + /* We should never reach this point */ + _pixman_log_error (FUNC, "No known combine function\n"); + return dummy_combine; } pixman_bool_t _pixman_implementation_blt (pixman_implementation_t * imp, uint32_t * src_bits, uint32_t * dst_bits, int src_stride, int dst_stride, @@ -240,22 +260,22 @@ pixman_bool_t _pixman_implementation_fill (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { while (imp) { if (imp->fill && - ((*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor))) + ((*imp->fill) (imp, bits, stride, bpp, x, y, width, height, filler))) { return TRUE; } imp = imp->fallback; } return FALSE;
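The shape of this change: both lookups used to be fallible and every caller had to check, whereas now they always hand back something callable, logging once and substituting a no-op in the supposedly unreachable miss case. A minimal illustration of the pattern, with made-up names:

#include <stdio.h>

typedef void (* op_func_t) (void);

static void
do_nothing (void)
{
}

/* 'found' stands for the result of the cache/table search above. */
static op_func_t
lookup_op (op_func_t found)
{
    if (!found)
    {
        /* "should never happen": complain once, keep running */
        fprintf (stderr, "lookup_op: no implementation\n");
        found = do_nothing;
    }

    return found; /* never NULL, so callers can invoke unconditionally */
}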
--- a/gfx/cairo/libpixman/src/pixman-inlines.h +++ b/gfx/cairo/libpixman/src/pixman-inlines.h @@ -83,16 +83,52 @@ repeat (pixman_repeat_t repeat, int *c, static force_inline int pixman_fixed_to_bilinear_weight (pixman_fixed_t x) { return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & ((1 << BILINEAR_INTERPOLATION_BITS) - 1); } +#if BILINEAR_INTERPOLATION_BITS <= 4 +/* Inspired by Filter_32_opaque from Skia */ +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + int distxy, distxiy, distixy, distixiy; + uint32_t lo, hi; + + distx <<= (4 - BILINEAR_INTERPOLATION_BITS); + disty <<= (4 - BILINEAR_INTERPOLATION_BITS); + + distxy = distx * disty; + distxiy = (distx << 4) - distxy; /* distx * (16 - disty) */ + distixy = (disty << 4) - distxy; /* disty * (16 - distx) */ + distixiy = + 16 * 16 - (disty << 4) - + (distx << 4) + distxy; /* (16 - distx) * (16 - disty) */ + + lo = (tl & 0xff00ff) * distixiy; + hi = ((tl >> 8) & 0xff00ff) * distixiy; + + lo += (tr & 0xff00ff) * distxiy; + hi += ((tr >> 8) & 0xff00ff) * distxiy; + + lo += (bl & 0xff00ff) * distixy; + hi += ((bl >> 8) & 0xff00ff) * distixy; + + lo += (br & 0xff00ff) * distxy; + hi += ((br >> 8) & 0xff00ff) * distxy; + + return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); +} + +#else #if SIZEOF_LONG > 4 static force_inline uint32_t bilinear_interpolation (uint32_t tl, uint32_t tr, uint32_t bl, uint32_t br, int distx, int disty) { uint64_t distxy, distxiy, distixy, distixiy; @@ -207,16 +243,17 @@ bilinear_interpolation (uint32_t tl, uin f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; r |= f & 0xff000000; return r; } #endif #endif +#endif // BILINEAR_INTERPOLATION_BITS <= 4 /* * For each scanline fetched from source image with PAD repeat: * - calculate how many pixels need to be padded on the left side * - calculate how many pixels need to be padded on the right side * - update width to only count pixels which are fetched from the image * All this information is returned via 'width', 'left_pad', 'right_pad' * arguments. The code is assuming that 'unit_x' is positive. 
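The new BILINEAR_INTERPOLATION_BITS <= 4 branch above works because distx and disty each fit in 4 bits: the four cross weights then fit in 8 bits, they sum to 16 * 16 = 256, and two color channels can be multiplied side by side in a single 32-bit register. A floating-point reference for the same blend (up to rounding; the packed version truncates):

#include <stdint.h>

static uint32_t
bilinear_ref (uint32_t tl, uint32_t tr, uint32_t bl, uint32_t br,
              int distx, int disty) /* both 0..15, as in the 4-bit path */
{
    double fx = distx / 16.0;
    double fy = disty / 16.0;
    uint32_t r = 0;
    int c;

    for (c = 0; c < 32; c += 8) /* each 8-bit channel of a8r8g8b8 */
    {
        double v = ((tl >> c) & 0xff) * (1 - fx) * (1 - fy)
                 + ((tr >> c) & 0xff) * fx * (1 - fy)
                 + ((bl >> c) & 0xff) * (1 - fx) * fy
                 + ((br >> c) & 0xff) * fx * fy;

        r |= (uint32_t)v << c;
    }

    return r;
}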
@@ -337,75 +374,75 @@ scanline_func_name (dst_type_t *ds \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ \ if (a1 == 0xff) \ { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ } \ else if (s1) \ { \ - d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + d = convert_ ## DST_FORMAT ## _to_8888 (*dst); \ + s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1); \ a1 ^= 0xff; \ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ } \ dst++; \ \ if (a2 == 0xff) \ { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2); \ } \ else if (s2) \ { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ + d = convert_## DST_FORMAT ## _to_8888 (*dst); \ + s2 = convert_## SRC_FORMAT ## _to_8888 (s2); \ a2 ^= 0xff; \ UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ } \ dst++; \ } \ else /* PIXMAN_OP_SRC */ \ { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2); \ } \ } \ \ if (w & 1) \ { \ x1 = pixman_fixed_to_int (vx); \ s1 = *(src + x1); \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ \ if (a1 == 0xff) \ { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ } \ else if (s1) \ { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + d = convert_## DST_FORMAT ## _to_8888 (*dst); \ + s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1); \ a1 ^= 0xff; \ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ } \ dst++; \ } \ else /* PIXMAN_OP_SRC */ \ { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ } \ } \ } #define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ dst_type_t, repeat_mode, have_mask, mask_is_solid) \ static void \ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \
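This hunk is a mechanical rename from the old CONVERT_* macros to lowercase convert_* inline functions; the ## token pasting is what lets one macro body service every source/destination format pair. In miniature (the stub is illustrative, not pixman's implementation):

#include <stdint.h>

static uint16_t
convert_8888_to_0565 (uint32_t s) /* stub body for illustration */
{
    return (uint16_t)(s & 0xffff);
}

/* convert_ ## SRC ## _to_ ## DST pastes into one identifier, so
 * STORE_CONVERTED (8888, 0565, p) expands to convert_8888_to_0565 (p). */
#define STORE_CONVERTED(SRC, DST, s) (convert_ ## SRC ## _to_ ## DST (s))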
--- a/gfx/cairo/libpixman/src/pixman-linear-gradient.c +++ b/gfx/cairo/libpixman/src/pixman-linear-gradient.c @@ -219,23 +219,16 @@ linear_get_scanline_narrow (pixman_iter_ } } iter->y++; return iter->buffer; } -static uint16_t convert_8888_to_0565(uint32_t color) -{ - return CONVERT_8888_TO_0565(color); -} - - - static uint32_t * linear_get_scanline_16 (pixman_iter_t *iter, const uint32_t *mask) { pixman_image_t *image = iter->image; int x = iter->x; int y = iter->y; int width = iter->width;
--- a/gfx/cairo/libpixman/src/pixman-matrix.c +++ b/gfx/cairo/libpixman/src/pixman-matrix.c @@ -29,95 +29,399 @@ #endif #include <math.h> #include <string.h> #include "pixman-private.h" #define F(x) pixman_int_to_fixed (x) +static force_inline int +count_leading_zeros (uint32_t x) +{ +#ifdef __GNUC__ + return __builtin_clz (x); +#else + int n = 0; + while (x) + { + n++; + x >>= 1; + } + return 32 - n; +#endif +} + +/* + * Large signed/unsigned integer division with rounding for the platforms with + * only 64-bit integer data type supported (no 128-bit data type). + * + * Arguments: + * hi, lo - high and low 64-bit parts of the dividend + * div - 48-bit divisor + * + * Returns: lowest 64 bits of the result as a return value and highest 64 + * bits of the result to "result_hi" pointer + */ + +/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */ +static force_inline uint64_t +rounded_udiv_128_by_48 (uint64_t hi, + uint64_t lo, + uint64_t div, + uint64_t *result_hi) +{ + uint64_t tmp, remainder, result_lo; + assert(div < ((uint64_t)1 << 48)); + + remainder = hi % div; + *result_hi = hi / div; + + tmp = (remainder << 16) + (lo >> 48); + result_lo = tmp / div; + remainder = tmp % div; + + tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + tmp = (remainder << 16) + (lo & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + /* round to nearest */ + if (remainder * 2 >= div && ++result_lo == 0) + *result_hi += 1; + + return result_lo; +} + +/* signed division (128-bit by 49-bit) with rounding to nearest */ +static inline int64_t +rounded_sdiv_128_by_49 (int64_t hi, + uint64_t lo, + int64_t div, + int64_t *signed_result_hi) +{ + uint64_t result_lo, result_hi; + int sign = 0; + if (div < 0) + { + div = -div; + sign ^= 1; + } + if (hi < 0) + { + if (lo != 0) + hi++; + hi = -hi; + lo = -lo; + sign ^= 1; + } + result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi); + if (sign) + { + if (result_lo != 0) + result_hi++; + result_hi = -result_hi; + result_lo = -result_lo; + } + if (signed_result_hi) + { + *signed_result_hi = result_hi; + } + return result_lo; +} + +/* + * Multiply 64.16 fixed point value by (2^scalebits) and convert + * to 128-bit integer. + */ +static force_inline void +fixed_64_16_to_int128 (int64_t hi, + int64_t lo, + int64_t *rhi, + int64_t *rlo, + int scalebits) +{ + /* separate integer and fractional parts */ + hi += lo >> 16; + lo &= 0xFFFF; + + if (scalebits <= 0) + { + *rlo = hi >> (-scalebits); + *rhi = *rlo >> 63; + } + else + { + *rhi = hi >> (64 - scalebits); + *rlo = (uint64_t)hi << scalebits; + if (scalebits < 16) + *rlo += lo >> (16 - scalebits); + else + *rlo += lo << (scalebits - 16); + } +} + +/* + * Convert 112.16 fixed point value to 48.16 with clamping for the out + * of range values. + */ +static force_inline pixman_fixed_48_16_t +fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag) +{ + if ((lo >> 63) != hi) + { + *clampflag = TRUE; + return hi >= 0 ? INT64_MAX : INT64_MIN; + } + else + { + return lo; + } +} + +/* + * Transform a point with 31.16 fixed point coordinates from the destination + * space to a point with 48.16 fixed point coordinates in the source space. 
+ * No overflows are possible for affine transformations and the results are + * accurate including the least significant bit. Projective transformations + * may overflow, in this case the results are just clamped to return maximum + * or minimum 48.16 values (so that the caller can at least handle the NONE + * and PAD repeats correctly) and the return value is FALSE to indicate that + * such clamping has happened. + */ +PIXMAN_EXPORT pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + pixman_bool_t clampflag = FALSE; + int i; + int64_t tmp[3][2], divint; + uint16_t divfrac; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + for (i = 0; i < 3; i++) + { + tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); + tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); + } + + /* + * separate 64-bit integer and 16-bit fractional parts for the divisor, + * which is also scaled by 65536 after fixed point multiplication. + */ + divint = tmp[2][0] + (tmp[2][1] >> 16); + divfrac = tmp[2][1] & 0xFFFF; + + if (divint == pixman_fixed_1 && divfrac == 0) + { + /* + * this is a simple affine transformation + */ + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + result->v[2] = pixman_fixed_1; + } + else if (divint == 0 && divfrac == 0) + { + /* + * handle zero divisor (if the values are non-zero, set the + * results to maximum positive or minimum negative) + */ + clampflag = TRUE; + + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + + if (result->v[0] > 0) + result->v[0] = INT64_MAX; + else if (result->v[0] < 0) + result->v[0] = INT64_MIN; + + if (result->v[1] > 0) + result->v[1] = INT64_MAX; + else if (result->v[1] < 0) + result->v[1] = INT64_MIN; + } + else + { + /* + * projective transformation, analyze the top 32 bits of the divisor + */ + int32_t hi32divbits = divint >> 32; + if (hi32divbits < 0) + hi32divbits = ~hi32divbits; + + if (hi32divbits == 0) + { + /* the divisor is small, we can actually keep all the bits */ + int64_t hi, rhi, lo, rlo; + int64_t div = (divint << 16) + divfrac; + + fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + + fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + } + else + { + /* the divisor needs to be reduced to 48 bits */ + int64_t hi, rhi, lo, rlo, div; + int shift = 32 - count_leading_zeros (hi32divbits); + fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift); + + fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], 
&hi, &lo, 32 - shift); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + + fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + } + } + result->v[2] = pixman_fixed_1; + return !clampflag; +} + +PIXMAN_EXPORT void +pixman_transform_point_31_16_affine (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + int64_t hi0, lo0, hi1, lo1; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + hi0 = (int64_t)t->matrix[0][0] * (v->v[0] >> 16); + lo0 = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF); + hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16); + lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF); + hi0 += (int64_t)t->matrix[0][2]; + + hi1 = (int64_t)t->matrix[1][0] * (v->v[0] >> 16); + lo1 = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF); + hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16); + lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF); + hi1 += (int64_t)t->matrix[1][2]; + + result->v[0] = hi0 + ((lo0 + 0x8000) >> 16); + result->v[1] = hi1 + ((lo1 + 0x8000) >> 16); + result->v[2] = pixman_fixed_1; +} + +PIXMAN_EXPORT void +pixman_transform_point_31_16_3d (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + int i; + int64_t tmp[3][2]; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + for (i = 0; i < 3; i++) + { + tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); + tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); + } + + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16); +} + PIXMAN_EXPORT void pixman_transform_init_identity (struct pixman_transform *matrix) { int i; memset (matrix, '\0', sizeof (struct pixman_transform)); for (i = 0; i < 3; i++) matrix->matrix[i][i] = F (1); } typedef pixman_fixed_32_32_t pixman_fixed_34_30_t; PIXMAN_EXPORT pixman_bool_t pixman_transform_point_3d (const struct pixman_transform *transform, struct pixman_vector * vector) { - struct pixman_vector result; - pixman_fixed_32_32_t partial; - pixman_fixed_48_16_t v; - int i, j; + pixman_vector_48_16_t tmp; + tmp.v[0] = vector->vector[0]; + tmp.v[1] = vector->vector[1]; + tmp.v[2] = vector->vector[2]; + + pixman_transform_point_31_16_3d (transform, &tmp, &tmp); - for (j = 0; j < 3; j++) - { - v = 0; - for (i = 0; i < 3; 
i++) - { - partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] * - (pixman_fixed_48_16_t) vector->vector[i]); - v += partial >> 16; - } - - if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) - return FALSE; - - result.vector[j] = (pixman_fixed_t) v; - } - - *vector = result; + vector->vector[0] = tmp.v[0]; + vector->vector[1] = tmp.v[1]; + vector->vector[2] = tmp.v[2]; - if (!result.vector[2]) - return FALSE; - - return TRUE; + return vector->vector[0] == tmp.v[0] && + vector->vector[1] == tmp.v[1] && + vector->vector[2] == tmp.v[2]; } PIXMAN_EXPORT pixman_bool_t pixman_transform_point (const struct pixman_transform *transform, struct pixman_vector * vector) { - pixman_fixed_32_32_t partial; - pixman_fixed_34_30_t v[3]; - pixman_fixed_48_16_t quo; - int i, j; + pixman_vector_48_16_t tmp; + tmp.v[0] = vector->vector[0]; + tmp.v[1] = vector->vector[1]; + tmp.v[2] = vector->vector[2]; + + if (!pixman_transform_point_31_16 (transform, &tmp, &tmp)) + return FALSE; - for (j = 0; j < 3; j++) - { - v[j] = 0; - - for (i = 0; i < 3; i++) - { - partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] * - (pixman_fixed_32_32_t) vector->vector[i]); - v[j] += partial >> 2; - } - } - - if (!(v[2] >> 16)) - return FALSE; + vector->vector[0] = tmp.v[0]; + vector->vector[1] = tmp.v[1]; + vector->vector[2] = tmp.v[2]; - for (j = 0; j < 2; j++) - { - quo = v[j] / (v[2] >> 16); - if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16) - return FALSE; - vector->vector[j] = (pixman_fixed_t) quo; - } - - vector->vector[2] = pixman_fixed_1; - return TRUE; + return vector->vector[0] == tmp.v[0] && + vector->vector[1] == tmp.v[1] && + vector->vector[2] == tmp.v[2]; } PIXMAN_EXPORT pixman_bool_t pixman_transform_multiply (struct pixman_transform * dst, const struct pixman_transform *l, const struct pixman_transform *r) { struct pixman_transform d; @@ -133,17 +437,17 @@ pixman_transform_multiply (struct pixman v = 0; for (o = 0; o < 3; o++) { partial = (pixman_fixed_32_32_t) l->matrix[dy][o] * (pixman_fixed_32_32_t) r->matrix[o][dx]; - v += partial >> 16; + v += (partial + 0x8000) >> 16; } if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) return FALSE; d.matrix[dy][dx] = (pixman_fixed_t) v; } } @@ -331,24 +635,24 @@ pixman_transform_bounds (const struct pi return TRUE; } PIXMAN_EXPORT pixman_bool_t pixman_transform_invert (struct pixman_transform * dst, const struct pixman_transform *src) { - struct pixman_f_transform m, r; + struct pixman_f_transform m; pixman_f_transform_from_pixman_transform (&m, src); - if (!pixman_f_transform_invert (&r, &m)) + if (!pixman_f_transform_invert (&m, &m)) return FALSE; - if (!pixman_transform_from_pixman_f_transform (dst, &r)) + if (!pixman_transform_from_pixman_f_transform (dst, &m)) return FALSE; return TRUE; } static pixman_bool_t within_epsilon (pixman_fixed_t a, pixman_fixed_t b, @@ -464,20 +768,21 @@ pixman_transform_from_pixman_f_transform return TRUE; } PIXMAN_EXPORT pixman_bool_t pixman_f_transform_invert (struct pixman_f_transform * dst, const struct pixman_f_transform *src) { + static const int a[3] = { 2, 2, 1 }; + static const int b[3] = { 1, 0, 0 }; + pixman_f_transform_t d; double det; int i, j; - static const int a[3] = { 2, 2, 1 }; - static const int b[3] = { 1, 0, 0 }; det = 0; for (i = 0; i < 3; i++) { double p; int ai = a[i]; int bi = b[i]; p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] - @@ -502,20 +807,22 @@ pixman_f_transform_invert (struct pixman int bj = b[j]; p = (src->m[ai][aj] * src->m[bi][bj] - src->m[ai][bj] * 
src->m[bi][aj]); if (((i + j) & 1) != 0) p = -p; - dst->m[j][i] = det * p; + d.m[j][i] = det * p; } } + *dst = d; + return TRUE; } PIXMAN_EXPORT pixman_bool_t pixman_f_transform_point (const struct pixman_f_transform *t, struct pixman_f_vector * v) { struct pixman_f_vector result;
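Two quiet behavior fixes sit in this hunk: pixman_transform_multiply now rounds each 16.16 product to nearest instead of truncating, and pixman_f_transform_invert accumulates into a local matrix so that dst may alias src. The rounding idiom, isolated:

#include <stdint.h>

/* A 16.16 by 16.16 multiply yields a 32.32 product; adding half an output
 * ULP (0x8000) before the shift turns truncation into round-to-nearest,
 * which is exactly the "v += (partial + 0x8000) >> 16" change above. */
static int64_t
fixed_mul_round (int32_t a, int32_t b)
{
    int64_t partial = (int64_t)a * b;

    return (partial + 0x8000) >> 16;
}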
--- a/gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.S +++ b/gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.S @@ -305,16 +305,149 @@ 3: bnez a2, 3b addiu a0, a0, 4 4: jr ra nop END(pixman_composite_src_x888_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + + + SAVE_REGS_ON_STACK 0, v0 + li v0, 0x00ff00ff + + beqz a3, 3f + nop + addiu t1, a3, -1 + beqz t1, 2f + nop + +1: + /* a1 = source (32bit constant) */ + lbu t0, 0(a2) /* t2 = mask (a8) */ + lbu t1, 1(a2) /* t3 = mask (a8) */ + addiu a2, a2, 2 + + MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9 + + sw t2, 0(a0) + sw t3, 4(a0) + addiu a3, a3, -2 + addiu t2, a3, -1 + bgtz t2, 1b + addiu a0, a0, 8 + + beqz a3, 3f + nop + +2: + lbu t0, 0(a2) + addiu a2, a2, 1 + + MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5 + + sw t1, 0(a0) + addiu a3, a3, -1 + addiu a0, a0, 4 + +3: + RESTORE_REGS_FROM_STACK 0, v0 + j ra + nop + +END(pixman_composite_src_n_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips) +/* + * a0 - dst (a8) + * a1 - src (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + + li t9, 0x00ff00ff + beqz a3, 3f + nop + srl t7, a3, 2 /* t7 = how many multiples of 4 dst pixels */ + beqz t7, 1f /* branch if less than 4 src pixels */ + nop + + srl t8, a1, 24 + replv.ph t8, t8 + +0: + beqz t7, 1f + addiu t7, t7, -1 + lbu t0, 0(a2) + lbu t1, 1(a2) + lbu t2, 2(a2) + lbu t3, 3(a2) + + addiu a2, a2, 4 + + precr_sra.ph.w t1, t0, 0 + precr_sra.ph.w t3, t2, 0 + precr.qb.ph t0, t3, t1 + + muleu_s.ph.qbl t2, t0, t8 + muleu_s.ph.qbr t3, t0, t8 + shra_r.ph t4, t2, 8 + shra_r.ph t5, t3, 8 + and t4, t4, t9 + and t5, t5, t9 + addq.ph t2, t2, t4 + addq.ph t3, t3, t5 + shra_r.ph t2, t2, 8 + shra_r.ph t3, t3, 8 + precr.qb.ph t2, t2, t3 + + sb t2, 0(a0) + srl t2, t2, 8 + sb t2, 1(a0) + srl t2, t2, 8 + sb t2, 2(a0) + srl t2, t2, 8 + sb t2, 3(a0) + addiu a3, a3, -4 + b 0b + addiu a0, a0, 4 + +1: + beqz a3, 3f + nop + srl t8, a1, 24 +2: + lbu t0, 0(a2) + addiu a2, a2, 1 + + mul t2, t0, t8 + shra_r.ph t3, t2, 8 + andi t3, t3, 0x00ff + addq.ph t2, t2, t3 + shra_r.ph t2, t2, 8 + + sb t2, 0(a0) + addiu a3, a3, -1 + bnez a3, 2b + addiu a0, a0, 1 + +3: + j ra + nop + +END(pixman_composite_src_n_8_8_asm_mips) + LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips) /* * a0 - dst (a8r8g8b8) * a1 - src (32bit constant) * a2 - mask (a8r8g8b8) * a3 - w */ @@ -1204,16 +1337,150 @@ 21: 3: RESTORE_REGS_FROM_STACK 0, s0, s1, s2 j ra nop END(pixman_composite_over_8888_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (32bit constant) + * a2 - w + */ + + beqz a2, 5f + nop + + not t0, a1 + srl t0, t0, 24 + bgtz t0, 1f + nop + CONVERT_1x8888_TO_1x0565 a1, t1, t2, t3 +0: + sh t1, 0(a0) + addiu a2, a2, -1 + bgtz a2, 0b + addiu a0, a0, 2 + j ra + nop + +1: + SAVE_REGS_ON_STACK 0, s0, s1, s2 + li t4, 0x00ff00ff + li t5, 0xf800f800 + li t6, 0x07e007e0 + li t7, 0x001F001F + addiu t1, a2, -1 + beqz t1, 3f + nop +2: + lhu t1, 0(a0) /* t1 = destination (r5g6b5) */ + lhu t2, 2(a0) /* t2 = destination (r5g6b5) */ + + CONVERT_2x0565_TO_2x8888 t1, t2, t3, t8, t6, t7, t9, s0, s1, s2 + MIPS_2xUN8x4_MUL_2xUN8 t3, t8, t0, t0, t1, t2, t4, t9, s0, s1, s2, t3, t8 + addu_s.qb t1, t1, a1 + addu_s.qb t2, t2, a1 + CONVERT_2x8888_TO_2x0565 t1, t2, t3, t8, t5, t6, t7, s0, s1 + + sh t3, 0(a0) + sh t8, 2(a0) + + addiu a2, a2, -2 + addiu t1, a2, -1 + bgtz t1, 2b + addiu a0, a0, 4 +3: + 
beqz a2, 4f + nop + + lhu t1, 0(a0) /* t1 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t1, t2, s0, s1 + MIPS_UN8x4_MUL_UN8 t2, t0, t1, t4, s0, s1, s2 + addu_s.qb t1, t1, a1 + CONVERT_1x8888_TO_1x0565 t1, t2, s0, s1 + + sh t2, 0(a0) + +4: + RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +5: + j ra + nop + +END(pixman_composite_over_n_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - w + */ + + beqz a2, 5f + nop + + not t0, a1 + srl t0, t0, 24 + bgtz t0, 1f + nop +0: + sw a1, 0(a0) + addiu a2, a2, -1 + bgtz a2, 0b + addiu a0, a0, 4 + j ra + nop + +1: + SAVE_REGS_ON_STACK 0, s0, s1, s2 + li t4, 0x00ff00ff + addiu t1, a2, -1 + beqz t1, 3f + nop +2: + lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */ + lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */ + + MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t0, t7, t8, t4, t9, s0, s1, s2, t2, t3 + + addu_s.qb t7, t7, a1 + addu_s.qb t8, t8, a1 + + sw t7, 0(a0) + sw t8, 4(a0) + + addiu a2, a2, -2 + addiu t1, a2, -1 + bgtz t1, 2b + addiu a0, a0, 8 +3: + beqz a2, 4f + nop + + lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + + MIPS_UN8x4_MUL_UN8 t1, t0, t3, t4, t5, t6, t7 + + addu_s.qb t3, t3, a1 + + sw t3, 0(a0) + +4: + RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +5: + j ra + nop + +END(pixman_composite_over_n_8888_asm_mips) + LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips) /* * a0 - dst (a8) * a1 - src (a8) * a2 - mask (a8) * a3 - w */ @@ -1828,16 +2095,517 @@ 3: bnez a2, 3b addiu a0, a0, 4 4: jr ra nop END(pixman_composite_add_8888_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8) + * a2 - w + */ + + beqz a2, 4f + nop + + SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 + li t2, 0xf800f800 + li t3, 0x07e007e0 + li t4, 0x001F001F + li t5, 0x00ff00ff + + addiu t1, a2, -1 + beqz t1, 2f + nop +1: + lbu t0, 0(a1) /* t0 = source (a8) */ + lbu t1, 1(a1) /* t1 = source (a8) */ + lhu t6, 0(a0) /* t6 = destination (r5g6b5) */ + lhu t7, 2(a0) /* t7 = destination (r5g6b5) */ + addiu a1, a1, 2 + + not t0, t0 + not t1, t1 + andi t0, 0xff /* t0 = neg source1 */ + andi t1, 0xff /* t1 = neg source2 */ + CONVERT_2x0565_TO_2x8888 t6, t7, t8, t9, t3, t4, s0, s1, s2, s3 + MIPS_2xUN8x4_MUL_2xUN8 t8, t9, t0, t1, t6, t7, t5, s0, s1, s2, s3, t8, t9 + CONVERT_2x8888_TO_2x0565 t6, t7, t8, t9, t2, t3, t4, s0, s1 + + sh t8, 0(a0) + sh t9, 2(a0) + addiu a2, a2, -2 + addiu t1, a2, -1 + bgtz t1, 1b + addiu a0, a0, 4 +2: + beqz a2, 3f + nop + lbu t0, 0(a1) /* t0 = source (a8) */ + lhu t1, 0(a0) /* t1 = destination (r5g6b5) */ + + not t0, t0 + andi t0, 0xff /* t0 = neg source */ + CONVERT_1x0565_TO_1x8888 t1, t2, t3, t4 + MIPS_UN8x4_MUL_UN8 t2, t0, t1, t5, t3, t4, t6 + CONVERT_1x8888_TO_1x0565 t1, t2, t3, t4 + + sh t2, 0(a0) +3: + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 +4: + j ra + nop + +END(pixman_composite_out_reverse_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (a8) + * a2 - w + */ + + beqz a2, 3f + nop + li t4, 0x00ff00ff + addiu t1, a2, -1 + beqz t1, 2f + nop +1: + lbu t0, 0(a1) /* t0 = source (a8) */ + lbu t1, 1(a1) /* t1 = source (a8) */ + lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */ + lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */ + addiu a1, a1, 2 + not t0, t0 + not t1, t1 + andi t0, 0xff /* t0 = neg source */ + andi t1, 0xff /* t1 = neg source */ + + MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t5, t6, t4, t7, t8, t9, t2, t3, t0 + + sw t5, 0(a0) + sw t6, 4(a0) + addiu 
a2, a2, -2 + addiu t1, a2, -1 + bgtz t1, 1b + addiu a0, a0, 8 +2: + beqz a2, 3f + nop + lbu t0, 0(a1) /* t0 = source (a8) */ + lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + not t0, t0 + andi t0, 0xff /* t0 = neg source */ + + MIPS_UN8x4_MUL_UN8 t1, t0, t2, t4, t3, t5, t6 + + sw t2, 0(a0) +3: + j ra + nop + +END(pixman_composite_out_reverse_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_reverse_n_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - w + */ + + beqz a2, 5f + nop + + SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 + li t0, 0x00ff00ff + srl t9, a2, 2 /* t9 = how many multiples of 4 src pixels */ + beqz t9, 2f /* branch if less than 4 src pixels */ + nop +1: + beqz t9, 2f + addiu t9, t9, -1 + + lw t1, 0(a0) + lw t2, 4(a0) + lw t3, 8(a0) + lw t4, 12(a0) + + addiu a2, a2, -4 + + not t5, t1 + not t6, t2 + not t7, t3 + not t8, t4 + srl t5, t5, 24 + srl t6, t6, 24 + srl t7, t7, 24 + srl t8, t8, 24 + replv.ph t5, t5 + replv.ph t6, t6 + replv.ph t7, t7 + replv.ph t8, t8 + muleu_s.ph.qbl s0, a1, t5 + muleu_s.ph.qbr s1, a1, t5 + muleu_s.ph.qbl s2, a1, t6 + muleu_s.ph.qbr s3, a1, t6 + muleu_s.ph.qbl s4, a1, t7 + muleu_s.ph.qbr s5, a1, t7 + muleu_s.ph.qbl s6, a1, t8 + muleu_s.ph.qbr s7, a1, t8 + + shra_r.ph t5, s0, 8 + shra_r.ph t6, s1, 8 + shra_r.ph t7, s2, 8 + shra_r.ph t8, s3, 8 + and t5, t5, t0 + and t6, t6, t0 + and t7, t7, t0 + and t8, t8, t0 + addq.ph s0, s0, t5 + addq.ph s1, s1, t6 + addq.ph s2, s2, t7 + addq.ph s3, s3, t8 + shra_r.ph s0, s0, 8 + shra_r.ph s1, s1, 8 + shra_r.ph s2, s2, 8 + shra_r.ph s3, s3, 8 + shra_r.ph t5, s4, 8 + shra_r.ph t6, s5, 8 + shra_r.ph t7, s6, 8 + shra_r.ph t8, s7, 8 + and t5, t5, t0 + and t6, t6, t0 + and t7, t7, t0 + and t8, t8, t0 + addq.ph s4, s4, t5 + addq.ph s5, s5, t6 + addq.ph s6, s6, t7 + addq.ph s7, s7, t8 + shra_r.ph s4, s4, 8 + shra_r.ph s5, s5, 8 + shra_r.ph s6, s6, 8 + shra_r.ph s7, s7, 8 + + precr.qb.ph t5, s0, s1 + precr.qb.ph t6, s2, s3 + precr.qb.ph t7, s4, s5 + precr.qb.ph t8, s6, s7 + addu_s.qb t5, t1, t5 + addu_s.qb t6, t2, t6 + addu_s.qb t7, t3, t7 + addu_s.qb t8, t4, t8 + + sw t5, 0(a0) + sw t6, 4(a0) + sw t7, 8(a0) + sw t8, 12(a0) + b 1b + addiu a0, a0, 16 + +2: + beqz a2, 4f + nop +3: + lw t1, 0(a0) + + not t2, t1 + srl t2, t2, 24 + replv.ph t2, t2 + + muleu_s.ph.qbl t4, a1, t2 + muleu_s.ph.qbr t5, a1, t2 + shra_r.ph t6, t4, 8 + shra_r.ph t7, t5, 8 + + and t6,t6,t0 + and t7,t7,t0 + + addq.ph t8, t4, t6 + addq.ph t9, t5, t7 + + shra_r.ph t8, t8, 8 + shra_r.ph t9, t9, 8 + + precr.qb.ph t9, t8, t9 + + addu_s.qb t9, t1, t9 + sw t9, 0(a0) + + addiu a2, a2, -1 + bnez a2, 3b + addiu a0, a0, 4 +4: + RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +5: + j ra + nop + +END(pixman_composite_over_reverse_n_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips) +/* + * a0 - dst (a8) + * a1 - src (a8r8g8b8) + * a2 - w + */ + + beqz a2, 5f + nop + + SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 + move t7, a1 + srl t5, t7, 24 + replv.ph t5, t5 + srl t9, a2, 2 /* t1 = how many multiples of 4 src pixels */ + beqz t9, 2f /* branch if less than 4 src pixels */ + nop + +1: + addiu t9, t9, -1 + addiu a2, a2, -4 + lbu t0, 0(a0) + lbu t1, 1(a0) + lbu t2, 2(a0) + lbu t3, 3(a0) + + muleu_s.ph.qbl s0, t0, t5 + muleu_s.ph.qbr s1, t0, t5 + muleu_s.ph.qbl s2, t1, t5 + muleu_s.ph.qbr s3, t1, t5 + muleu_s.ph.qbl s4, t2, t5 + muleu_s.ph.qbr s5, t2, t5 + muleu_s.ph.qbl s6, t3, t5 + muleu_s.ph.qbr s7, t3, t5 + + shrl.ph t4, s0, 8 + shrl.ph t6, s1, 8 + shrl.ph t7, s2, 8 + shrl.ph t8, 
s3, 8 + addq.ph t0, s0, t4 + addq.ph t1, s1, t6 + addq.ph t2, s2, t7 + addq.ph t3, s3, t8 + shra_r.ph t0, t0, 8 + shra_r.ph t1, t1, 8 + shra_r.ph t2, t2, 8 + shra_r.ph t3, t3, 8 + shrl.ph t4, s4, 8 + shrl.ph t6, s5, 8 + shrl.ph t7, s6, 8 + shrl.ph t8, s7, 8 + addq.ph s0, s4, t4 + addq.ph s1, s5, t6 + addq.ph s2, s6, t7 + addq.ph s3, s7, t8 + shra_r.ph t4, s0, 8 + shra_r.ph t6, s1, 8 + shra_r.ph t7, s2, 8 + shra_r.ph t8, s3, 8 + + precr.qb.ph s0, t0, t1 + precr.qb.ph s1, t2, t3 + precr.qb.ph s2, t4, t6 + precr.qb.ph s3, t7, t8 + + sb s0, 0(a0) + sb s1, 1(a0) + sb s2, 2(a0) + sb s3, 3(a0) + bgtz t9, 1b + addiu a0, a0, 4 +2: + beqz a2, 4f + nop +3: + lbu t1, 0(a0) + + muleu_s.ph.qbl t4, t1, t5 + muleu_s.ph.qbr t7, t1, t5 + shrl.ph t6, t4, 8 + shrl.ph t0, t7, 8 + addq.ph t8, t4, t6 + addq.ph t9, t7, t0 + shra_r.ph t8, t8, 8 + shra_r.ph t9, t9, 8 + precr.qb.ph t2, t8, t9 + sb t2, 0(a0) + addiu a2, a2, -1 + bnez a2, 3b + addiu a0, a0, 1 +4: + RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +5: + j ra + nop + +END(pixman_composite_in_n_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - mask (a8) + * a3 - w + * 16(sp) - vx + * 20(sp) - unit_x + */ + beqz a3, 4f + nop + + SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 + lw v0, 36(sp) /* v0 = vx */ + lw v1, 40(sp) /* v1 = unit_x */ + li t6, 0x00ff00ff + li t7, 0xf800f800 + li t8, 0x07e007e0 + li t9, 0x001F001F + + addiu t1, a3, -1 + beqz t1, 2f + nop +1: + sra t0, v0, 16 /* t0 = vx >> 16 */ + sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */ + addu t0, a1, t0 + lw t0, 0(t0) /* t0 = source (a8r8g8b8) */ + addu v0, v0, v1 /* v0 = vx + unit_x */ + sra t1, v0, 16 /* t1 = vx >> 16 */ + sll t1, t1, 2 /* t1 = t1 * 4 (a8r8g8b8) */ + addu t1, a1, t1 + lw t1, 0(t1) /* t1 = source (a8r8g8b8) */ + addu v0, v0, v1 /* v0 = vx + unit_x */ + lbu t2, 0(a2) /* t2 = mask (a8) */ + lbu t3, 1(a2) /* t3 = mask (a8) */ + lhu t4, 0(a0) /* t4 = destination (r5g6b5) */ + lhu t5, 2(a0) /* t5 = destination (r5g6b5) */ + addiu a2, a2, 2 + + CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5 + OVER_2x8888_2x8_2x8888 t0, t1, \ + t2, t3, \ + s0, s1, \ + t4, t5, \ + t6, s2, s3, s4, s5, t2, t3 + CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3 + + sh s0, 0(a0) + sh s1, 2(a0) + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu a0, a0, 4 +2: + beqz a3, 3f + nop + sra t0, v0, 16 /* t0 = vx >> 16 */ + sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */ + addu t0, a1, t0 + lw t0, 0(t0) /* t0 = source (a8r8g8b8) */ + lbu t1, 0(a2) /* t1 = mask (a8) */ + lhu t2, 0(a0) /* t2 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5 + OVER_8888_8_8888 t0, t1, t3, t2, t6, t4, t5, t7, t8 + CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5 + + sh t3, 0(a0) +3: + RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +4: + j ra + nop + +END(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (r5g6b5) + * a2 - mask (a8) + * a3 - w + * 16(sp) - vx + * 20(sp) - unit_x + */ + + beqz a3, 4f + nop + SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 + lw v0, 36(sp) /* v0 = vx */ + lw v1, 40(sp) /* v1 = unit_x */ + li t4, 0xf800f800 + li t5, 0x07e007e0 + li t6, 0x001F001F + li t7, 0x00ff00ff + + addiu t1, a3, -1 + beqz t1, 2f + nop +1: + sra t0, v0, 16 /* t0 = vx >> 16 */ + sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */ + addu t0, a1, t0 + lhu t0, 
0(t0) /* t0 = source (r5g6b5) */ + addu v0, v0, v1 /* v0 = vx + unit_x */ + sra t1, v0, 16 /* t1 = vx >> 16 */ + sll t1, t1, 1 /* t1 = t1 * 2 (r5g6b5) */ + addu t1, a1, t1 + lhu t1, 0(t1) /* t1 = source (r5g6b5) */ + addu v0, v0, v1 /* v0 = vx + unit_x */ + lbu t2, 0(a2) /* t2 = mask (a8) */ + lbu t3, 1(a2) /* t3 = mask (a8) */ + lhu t8, 0(a0) /* t8 = destination (r5g6b5) */ + lhu t9, 2(a0) /* t9 = destination (r5g6b5) */ + addiu a2, a2, 2 + + CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 + CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1 + OVER_2x8888_2x8_2x8888 s0, s1, \ + t2, t3, \ + s2, s3, \ + t0, t1, \ + t7, t8, t9, s4, s5, s0, s1 + CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3 + + sh s0, 0(a0) + sh s1, 2(a0) + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu a0, a0, 4 +2: + beqz a3, 3f + nop + sra t0, v0, 16 /* t0 = vx >> 16 */ + sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */ + addu t0, a1, t0 + + lhu t0, 0(t0) /* t0 = source (r5g6b5) */ + lbu t1, 0(a2) /* t1 = mask (a8) */ + lhu t2, 0(a0) /* t2 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5 + CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6 + OVER_8888_8_8888 t3, t1, t4, t0, t7, t2, t5, t6, t8 + CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 + + sh t3, 0(a0) +3: + RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +4: + j ra + nop + +END(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips) + LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips) /* * a0 - *dst * a1 - *src_top * a2 - *src_bottom * a3 - w * 16(sp) - wt * 20(sp) - wb
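The arithmetic in these DSPr2 loops is easier to follow in scalar form. muleu_s.ph.qbl/qbr multiply four 8-bit channels by a replicated 8-bit factor, and each shra_r.ph / addq.ph / shra_r.ph triple is the usual rounded division by 255 (the and with 0x00ff00ff only keeps the paired halfwords from bleeding into each other). The scaled_nearest scanlines likewise keep the source coordinate vx in 16.16 fixed point and advance it by unit_x per output pixel, so the fetch index is simply vx >> 16. A minimal C model of the rounding step, an illustration rather than the shipped code (div_255 is a name invented here):

    #include <stdint.h>

    /* Rounded x/255 for x in [0, 255*255], matching the register-level
     * sequence used above. */
    static inline uint32_t
    div_255 (uint32_t x)
    {
        uint32_t t = (x + 0x80) >> 8;   /* shra_r.ph  t, x, 8  */
        return (x + t + 0x80) >> 8;     /* addq.ph; shra_r.ph  */
    }
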
--- a/gfx/cairo/libpixman/src/pixman-mips-dspr2.c +++ b/gfx/cairo/libpixman/src/pixman-mips-dspr2.c @@ -49,17 +49,25 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_F PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888, uint8_t, 3, uint8_t, 3) PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888, uint32_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8, uint8_t, 1, uint8_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_0565, + uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_8888, + uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888, + uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8, + uint8_t, 1, uint8_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca, uint32_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca, uint32_t, 1, uint16_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565, uint8_t, 1, uint16_t, 1) @@ -72,16 +80,25 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SK uint32_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565, uint32_t, 1, uint16_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565, uint16_t, 1, uint16_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_0565, + uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888, + uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_reverse_n_8888, + uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (0, in_n_8, + uint8_t, 1) + PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t, 1, uint8_t, 1, uint8_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1, uint8_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8888_8888, uint32_t, 1, uint32_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1, uint8_t, 1, uint16_t, 1) @@ -102,16 +119,21 @@ PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST uint16_t, uint32_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 0565_0565, SRC, uint16_t, uint16_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, OVER, uint32_t, uint32_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, ADD, uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_0565, + OVER, uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 0565_8_0565, + OVER, uint16_t, uint16_t) + PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC, uint32_t, uint32_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC, uint32_t, uint16_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC, uint16_t, uint32_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC, uint16_t, uint16_t) @@ -251,30 +273,37 @@ static const pixman_fast_path_t mips_dsp PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, 
mips_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888), PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mips_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mips_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mips_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mips_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, mips_composite_src_n_8_8), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca), PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565), - + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mips_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mips_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mips_composite_over_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mips_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mips_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, mips_composite_over_8888_n_0565), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, mips_composite_over_8888_n_0565), PIXMAN_STD_FAST_PATH (OVER, r5g6b5, solid, r5g6b5, mips_composite_over_0565_n_0565), PIXMAN_STD_FAST_PATH (OVER, b5g6r5, solid, b5g6r5, mips_composite_over_0565_n_0565), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, mips_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, mips_composite_over_8888_8_8888), @@ -298,16 +327,29 @@ static const pixman_fast_path_t mips_dsp PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, mips_composite_add_8888_8_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, mips_composite_add_8888_8_8888), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_add_8888_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, mips_composite_add_8888_n_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, mips_composite_add_8888_n_8888), PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mips_composite_add_8_8), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mips_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mips_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, 
mips_composite_out_reverse_8_0565), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, mips_composite_out_reverse_8_0565), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, mips_composite_out_reverse_8_8888), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, mips_composite_out_reverse_8_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mips_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, mips_composite_in_n_8), + + PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565), + PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565), + + PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565), + PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_0565), SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_0565),
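Of the new table entries, over_reverse_n_8888 is the least familiar operator: OVER_REVERSE composites the existing destination over the solid source, so each channel becomes dst + src * (255 - dst.alpha) / 255 with a saturating byte add, which is what the not/srl/replv.ph/muleu_s/addu_s.qb sequence in the assembly computes four pixels at a time. A scalar sketch of the per-pixel effect (illustrative only; div_255 is the rounded division shown earlier):

    static uint32_t
    over_reverse_pixel (uint32_t dst, uint32_t src)
    {
        uint32_t ia = 255u - (dst >> 24);   /* not t5, t1; srl t5, t5, 24 */
        uint32_t out = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint32_t d = (dst >> shift) & 0xff;
            uint32_t s = div_255 (((src >> shift) & 0xff) * ia);
            uint32_t v = d + s;             /* addu_s.qb saturates per byte */

            out |= (v > 0xff ? 0xffu : v) << shift;
        }
        return out;
    }
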
--- a/gfx/cairo/libpixman/src/pixman-mips-dspr2.h +++ b/gfx/cairo/libpixman/src/pixman-mips-dspr2.h @@ -80,16 +80,52 @@ mips_composite_##name (pixman_implementa \ if (flags == DO_FAST_MEMCPY) \ pixman_mips_fast_memcpy (dst, src, width * bpp); \ else \ pixman_composite_##name##_asm_mips (dst, src, width); \ } \ } +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_N_DST(flags, name, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + uint32_t src, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + int32_t dst_stride; \ + uint32_t src; \ + \ + src = _pixman_image_get_solid ( \ + imp, src_image, dest_image->bits.format); \ + \ + if ((flags & SKIP_ZERO_SRC) && src == 0) \ + return; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + \ + while (height--) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + \ + pixman_composite_##name##_asm_mips (dst, src, width); \ + } \ +} + /*******************************************************************/ #define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name, \ mask_type, mask_cnt, \ dst_type, dst_cnt) \ void \ pixman_composite_##name##_asm_mips (dst_type *dst, \ uint32_t src, \ @@ -205,16 +241,62 @@ mips_composite_##name (pixman_implementa mask = mask_line; \ mask_line += mask_stride; \ src = src_line; \ src_line += src_stride; \ pixman_composite_##name##_asm_mips (dst, src, mask, width); \ } \ } +/*****************************************************************************/ + +#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op, \ + src_type, dst_type) \ +void \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_mips ( \ + dst_type * dst, \ + const src_type * src, \ + const uint8_t * mask, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x); \ + \ +static force_inline void \ +scaled_nearest_scanline_mips_##name##_##op (const uint8_t * mask, \ + dst_type * pd, \ + const src_type * ps, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ +{ \ + if ((flags & SKIP_ZERO_SRC) && zero_src) \ + return; \ + pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, \ + mask, w, \ + vx, unit_x); \ +} \ + \ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_cover_##op, \ + scaled_nearest_scanline_mips_##name##_##op, \ + src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_none_##op, \ + scaled_nearest_scanline_mips_##name##_##op, \ + src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op, \ + scaled_nearest_scanline_mips_##name##_##op, \ + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + +/* Provide entries for the fast path table */ +#define PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + /****************************************************************************/ #define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op, \ src_type, dst_type) \ void \ pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips( \ dst_type * dst, \ const src_type * src_top, \
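Each of these BIND macros stamps out a prototype for the assembly routine plus a thin C wrapper that walks the destination one scanline at a time. For instance, PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888, uint32_t, 1) expands to approximately:

    void
    pixman_composite_over_n_8888_asm_mips (uint32_t *dst, uint32_t src,
                                           int32_t w);

    static void
    mips_composite_over_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    {
        PIXMAN_COMPOSITE_ARGS (info);
        uint32_t *dst_line, *dst;
        int32_t dst_stride;
        uint32_t src;

        src = _pixman_image_get_solid (imp, src_image,
                                       dest_image->bits.format);

        if (src == 0)   /* the SKIP_ZERO_SRC flag test, constant-folded */
            return;

        PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
                               dst_stride, dst_line, 1);

        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            pixman_composite_over_n_8888_asm_mips (dst, src, width);
        }
    }

The nearest-scaling macro works the same way, except that FAST_NEAREST_MAINLOOP_COMMON then generates COVER, NONE and PAD repeat variants around the scanline function, and PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH registers all three in the fast-path table.
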
--- a/gfx/cairo/libpixman/src/pixman-mmx.c +++ b/gfx/cairo/libpixman/src/pixman-mmx.c @@ -57,17 +57,17 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_empty (void) { } #endif #ifdef USE_X86_MMX -# if (defined(__SUNPRO_C) || defined(_MSC_VER)) +# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) # include <xmmintrin.h> # else /* We have to compile with -msse to use xmmintrin.h, but that causes SSE * instructions to be generated that we don't want. Just duplicate the * functions we want to use. */ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { @@ -1397,17 +1397,17 @@ mmx_composite_over_n_8888 (pixman_implem while (height--) { dst = dst_line; dst_line += dst_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { store8888 (dst, over (vsrc, vsrca, load8888 (dst))); w--; dst++; } while (w >= 2) @@ -1463,17 +1463,17 @@ mmx_composite_over_n_0565 (pixman_implem while (height--) { dst = dst_line; dst_line += dst_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); *dst = to_uint64 (vdest); w--; @@ -1541,17 +1541,17 @@ mmx_composite_over_n_8888_8888_ca (pixma vsrca = expand_alpha (vsrc); while (height--) { int twidth = width; uint32_t *p = (uint32_t *)mask_line; uint32_t *q = (uint32_t *)dst_line; - while (twidth && (unsigned long)q & 7) + while (twidth && (uintptr_t)q & 7) { uint32_t m = *(uint32_t *)p; if (m) { __m64 vdest = load8888 (q); vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); store8888 (q, vdest); @@ -1632,17 +1632,17 @@ mmx_composite_over_8888_n_8888 (pixman_i while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { __m64 s = load8888 (src); __m64 d = load8888 (dst); store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); w--; dst++; @@ -1702,17 +1702,17 @@ mmx_composite_over_x888_n_8888 (pixman_i while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint32_t ssrc = *src | 0xff000000; __m64 s = load8888 (&ssrc); __m64 d = load8888 (dst); store8888 (dst, in_over (s, srca, vmask, d)); w--; @@ -1876,17 +1876,17 @@ mmx_composite_over_8888_0565 (pixman_imp dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); vdest = pack_565 ( over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); @@ -1979,17 +1979,17 @@ mmx_composite_over_n_8_8888 (pixman_impl dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; if (m) { __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), load8888 (dst)); @@ -2059,17 +2059,17 @@ static pixman_bool_t mmx_fill (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { uint64_t fill; __m64 vfill; uint32_t 
byte_width; uint8_t *byte_line; #if defined __GNUC__ && defined USE_X86_MMX __m64 v1, v2, v3, v4, v5, v6, v7; @@ -2079,35 +2079,35 @@ mmx_fill (pixman_implementation_t *imp, return FALSE; if (bpp == 8) { stride = stride * (int) sizeof (uint32_t) / 1; byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); byte_width = width; stride *= 1; - xor = (xor & 0xff) * 0x01010101; + filler = (filler & 0xff) * 0x01010101; } else if (bpp == 16) { stride = stride * (int) sizeof (uint32_t) / 2; byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; - xor = (xor & 0xffff) * 0x00010001; + filler = (filler & 0xffff) * 0x00010001; } else { stride = stride * (int) sizeof (uint32_t) / 4; byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); byte_width = 4 * width; stride *= 4; } - fill = ((uint64_t)xor << 32) | xor; + fill = ((uint64_t)filler << 32) | filler; vfill = to_m64 (fill); #if defined __GNUC__ && defined USE_X86_MMX __asm__ ( "movq %7, %0\n" "movq %7, %1\n" "movq %7, %2\n" "movq %7, %3\n" @@ -2122,33 +2122,33 @@ mmx_fill (pixman_implementation_t *imp, while (height--) { int w; uint8_t *d = byte_line; byte_line += stride; w = byte_width; - if (w >= 1 && ((unsigned long)d & 1)) + if (w >= 1 && ((uintptr_t)d & 1)) { - *(uint8_t *)d = (xor & 0xff); + *(uint8_t *)d = (filler & 0xff); w--; d++; } - if (w >= 2 && ((unsigned long)d & 3)) + if (w >= 2 && ((uintptr_t)d & 3)) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } - while (w >= 4 && ((unsigned long)d & 7)) + while (w >= 4 && ((uintptr_t)d & 7)) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; } while (w >= 64) { #if defined __GNUC__ && defined USE_X86_MMX @@ -2177,30 +2177,30 @@ mmx_fill (pixman_implementation_t *imp, *(__m64*) (d + 56) = vfill; #endif w -= 64; d += 64; } while (w >= 4) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; } if (w >= 2) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } if (w >= 1) { - *(uint8_t *)d = (xor & 0xff); + *(uint8_t *)d = (filler & 0xff); w--; d++; } } _mm_empty (); return TRUE; @@ -2222,20 +2222,20 @@ mmx_composite_src_x888_0565 (pixman_impl while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { s = *src++; - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); dst++; w--; } while (w >= 4) { __m64 vdest; __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); @@ -2248,17 +2248,17 @@ mmx_composite_src_x888_0565 (pixman_impl w -= 4; src += 4; dst += 4; } while (w) { s = *src++; - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); dst++; w--; } } _mm_empty (); } @@ -2300,17 +2300,17 @@ mmx_composite_src_n_8_8888 (pixman_imple dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; if (m) { __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); store8888 (dst, vdest); @@ -2414,17 +2414,17 @@ mmx_composite_over_n_8_0565 (pixman_impl dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; if (m) { uint64_t d = *dst; __m64 vd = to_m64 (d); __m64 vdest = in_over ( @@ -2531,17 +2531,17 @@ mmx_composite_over_pixbuf_0565 (pixman_i dst = dst_line; 
dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); *dst = to_uint64 (vdest); @@ -2646,17 +2646,17 @@ mmx_composite_over_pixbuf_8888 (pixman_i while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { __m64 s = load8888 (src); __m64 d = load8888 (dst); store8888 (dst, over_rev_non_pre (s, d)); w--; dst++; @@ -2734,17 +2734,17 @@ mmx_composite_over_n_8888_0565_ca (pixma vsrca = expand_alpha (vsrc); while (height--) { int twidth = width; uint32_t *p = (uint32_t *)mask_line; uint16_t *q = (uint16_t *)dst_line; - while (twidth && ((unsigned long)q & 7)) + while (twidth && ((uintptr_t)q & 7)) { uint32_t m = *(uint32_t *)p; if (m) { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); @@ -2835,17 +2835,17 @@ mmx_composite_in_n_8_8 (pixman_implement while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint16_t tmp; uint8_t a; uint32_t m, d; a = *mask++; d = *dst; @@ -2906,17 +2906,17 @@ mmx_composite_in_8_8 (pixman_implementat while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 3) + while (w && (uintptr_t)dst & 3) { uint8_t s, d; uint16_t tmp; s = *src; d = *dst; *dst = MUL_UN8 (s, d, tmp); @@ -2985,17 +2985,17 @@ mmx_composite_add_n_8_8 (pixman_implemen while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && (unsigned long)dst & 3) + while (w && (uintptr_t)dst & 3) { uint16_t tmp; uint16_t a; uint32_t m, d; uint32_t r; a = *mask++; d = *dst; @@ -3062,17 +3062,17 @@ mmx_composite_add_8_8 (pixman_implementa while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { s = *src; d = *dst; t = d + s; s = t | (0 - (t >> 8)); *dst = s; dst++; @@ -3125,29 +3125,29 @@ mmx_composite_add_0565_0565 (pixman_impl while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { s = *src++; if (s) { d = *dst; - s = CONVERT_0565_TO_8888 (s); + s = convert_0565_to_8888 (s); if (d) { - d = CONVERT_0565_TO_8888 (d); + d = convert_0565_to_8888 (d); UN8x4_ADD_UN8x4 (s, d); } - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); } dst++; w--; } while (w >= 4) { __m64 vdest = *(__m64 *)dst; @@ -3169,23 +3169,23 @@ mmx_composite_add_0565_0565 (pixman_impl } while (w--) { s = *src++; if (s) { d = *dst; - s = CONVERT_0565_TO_8888 (s); + s = convert_0565_to_8888 (s); if (d) { - d = CONVERT_0565_TO_8888 (d); + d = convert_0565_to_8888 (d); UN8x4_ADD_UN8x4 (s, d); } - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); } dst++; } } _mm_empty (); } @@ -3207,17 +3207,17 @@ mmx_composite_add_8888_8888 (pixman_impl while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += 
src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), load ((const uint32_t *)dst))); dst++; src++; w--; } @@ -3291,33 +3291,33 @@ mmx_blt (pixman_implementation_t *imp, { int w; uint8_t *s = src_bytes; uint8_t *d = dst_bytes; src_bytes += src_stride; dst_bytes += dst_stride; w = byte_width; - if (w >= 1 && ((unsigned long)d & 1)) + if (w >= 1 && ((uintptr_t)d & 1)) { *(uint8_t *)d = *(uint8_t *)s; w -= 1; s += 1; d += 1; } - if (w >= 2 && ((unsigned long)d & 3)) + if (w >= 2 && ((uintptr_t)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; w -= 2; s += 2; d += 2; } - while (w >= 4 && ((unsigned long)d & 7)) + while (w >= 4 && ((uintptr_t)d & 7)) { *(uint32_t *)d = ldl_u ((uint32_t *)s); w -= 4; s += 4; d += 4; } @@ -3490,17 +3490,17 @@ mmx_composite_over_reverse_n_8888 (pixma while (height--) { dst = dst_line; dst_line += dst_stride; w = width; CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { __m64 vdest = load8888 (dst); store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); w--; dst++; } @@ -3773,17 +3773,17 @@ static uint32_t * mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) { int w = iter->width; uint32_t *dst = iter->buffer; uint32_t *src = (uint32_t *)iter->bits; iter->bits += iter->stride; - while (w && ((unsigned long)dst) & 7) + while (w && ((uintptr_t)dst) & 7) { *dst++ = (*src++) | 0xff000000; w--; } while (w >= 8) { __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); @@ -3815,21 +3815,21 @@ static uint32_t * mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) { int w = iter->width; uint32_t *dst = iter->buffer; uint16_t *src = (uint16_t *)iter->bits; iter->bits += iter->stride; - while (w && ((unsigned long)dst) & 0x0f) + while (w && ((uintptr_t)dst) & 0x0f) { uint16_t s = *src++; - *dst++ = CONVERT_0565_TO_8888 (s); + *dst++ = convert_0565_to_8888 (s); w--; } while (w >= 4) { __m64 vsrc = ldq_u ((__m64 *)src); __m64 mm0, mm1; @@ -3842,34 +3842,34 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, c src += 4; w -= 4; } while (w) { uint16_t s = *src++; - *dst++ = CONVERT_0565_TO_8888 (s); + *dst++ = convert_0565_to_8888 (s); w--; } _mm_empty (); return iter->buffer; } static uint32_t * mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) { int w = iter->width; uint32_t *dst = iter->buffer; uint8_t *src = iter->bits; iter->bits += iter->stride; - while (w && (((unsigned long)dst) & 15)) + while (w && (((uintptr_t)dst) & 15)) { *dst++ = *(src++) << 24; w--; } while (w >= 8) { __m64 mm0 = ldq_u ((__m64 *)src);
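Two mechanical renames run through this file. The xor parameter becomes filler, presumably so the source stays valid where xor is a reserved alternative token (C++ and iso646.h); the behaviour is unchanged. Every (unsigned long)ptr alignment test becomes (uintptr_t)ptr: on LLP64 targets such as 64-bit Windows, unsigned long is only 32 bits while pointers are 64, so uintptr_t is the only portable integer type for pointer alignment tests. The idiom, in miniature (process_one is a hypothetical stand-in for the real per-pixel operation):

    #include <stdint.h>

    /* Alignment peel: handle single pixels until dst is 8-byte aligned,
     * then the wide __m64 loops take over.  (uintptr_t)d preserves all
     * pointer bits; (unsigned long)d truncates them on LLP64. */
    static void
    align_head (uint32_t *d, int w, uint32_t (*process_one) (uint32_t))
    {
        while (w && ((uintptr_t)d & 7))
        {
            *d = process_one (*d);
            d++;
            w--;
        }
        /* ... 8-byte-wide MMX body follows ... */
    }

Note that the two SSE2 compositors added later in this patch, sse2_composite_add_n_8888 and sse2_composite_add_n_8_8888, still use the old (unsigned long) cast in their head loops.
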
--- a/gfx/cairo/libpixman/src/pixman-noop.c +++ b/gfx/cairo/libpixman/src/pixman-noop.c @@ -72,35 +72,43 @@ noop_src_iter_init (pixman_implementatio iter->get_scanline = get_scanline_null; } else if ((iter->iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) { iter->get_scanline = _pixman_iter_get_scanline_noop; } else if (image->common.extended_format_code == PIXMAN_solid && - ((iter->image_flags & (FAST_PATH_BITS_IMAGE | FAST_PATH_NO_ALPHA_MAP)) == - (FAST_PATH_BITS_IMAGE | FAST_PATH_NO_ALPHA_MAP))) + (iter->image->type == SOLID || + (iter->image_flags & FAST_PATH_NO_ALPHA_MAP))) { - bits_image_t *bits = &image->bits; - if (iter->iter_flags & ITER_NARROW) { - uint32_t color = bits->fetch_pixel_32 (bits, 0, 0); uint32_t *buffer = iter->buffer; uint32_t *end = buffer + iter->width; + uint32_t color; + + if (image->type == SOLID) + color = image->solid.color_32; + else + color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); while (buffer < end) *(buffer++) = color; } else { - argb_t color = bits->fetch_pixel_float (bits, 0, 0); argb_t *buffer = (argb_t *)iter->buffer; argb_t *end = buffer + iter->width; + argb_t color; + + if (image->type == SOLID) + color = image->solid.color_float; + else + color = image->bits.fetch_pixel_float (&image->bits, 0, 0); while (buffer < end) *(buffer++) = color; } iter->get_scanline = _pixman_iter_get_scanline_noop; } else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 &&
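The iterator change widens the old guard: a solid scanline source no longer has to be a 1x1 bits image with no alpha map, since a genuine SOLID image now qualifies too and its color_32 / color_float members are read directly. The pattern is fill-once-then-noop: the buffer is populated at init time, after which every get_scanline call returns the same buffer. Reduced to a skeleton of the narrow case (a sketch, not the exact code):

    static void
    solid_iter_init_sketch (pixman_iter_t *iter, uint32_t color_32)
    {
        uint32_t *b = iter->buffer;
        uint32_t *e = b + iter->width;

        while (b < e)
            *b++ = color_32;    /* wide path fills argb_t floats instead */

        /* all later calls just hand the precomputed buffer back */
        iter->get_scanline = _pixman_iter_get_scanline_noop;
    }
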
--- a/gfx/cairo/libpixman/src/pixman-ppc.c +++ b/gfx/cairo/libpixman/src/pixman-ppc.c @@ -32,38 +32,38 @@ * across function calls causing SIGILL on cpus without Altivec/vmx. */ #ifdef __APPLE__ #include <sys/sysctl.h> static pixman_bool_t pixman_have_vmx (void) { + int error, have_vmx; size_t length = sizeof(have_vmx); - int error, have_mmx; - sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); + error = sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); if (error) return FALSE; return have_vmx; } #elif defined (__OpenBSD__) #include <sys/param.h> #include <sys/sysctl.h> #include <machine/cpu.h> static pixman_bool_t pixman_have_vmx (void) { + int error, have_vmx; int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; size_t length = sizeof(have_vmx); - int error, have_vmx; error = sysctl (mib, 2, &have_vmx, &length, NULL, 0); if (error != 0) return FALSE; return have_vmx; }
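Before this fix the Darwin probe declared have_mmx but read have_vmx, and it ignored sysctlbyname's return value, so a failed lookup left the result uninitialized. The corrected shape generalizes to any integer feature probe; a sketch (Darwin-specific; probe_int_sysctl is a name invented here):

    #include <sys/sysctl.h>

    /* Return the integer value of a sysctl key, or 0 if the lookup fails.
     * Only trust `value` when the call reports success. */
    static int
    probe_int_sysctl (const char *name)
    {
        int value = 0;
        size_t len = sizeof (value);

        if (sysctlbyname (name, &value, &len, NULL, 0) != 0)
            return 0;   /* unknown key: report the feature as absent */

        return value;
    }

With that helper, pixman_have_vmx () reduces to probe_int_sysctl ("hw.optional.altivec") != 0.
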
--- a/gfx/cairo/libpixman/src/pixman-private.h +++ b/gfx/cairo/libpixman/src/pixman-private.h @@ -278,19 +278,16 @@ void void _pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter); void _pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter); - -void _pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); void _pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); void _pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); @@ -470,17 +467,17 @@ typedef pixman_bool_t (*pixman_blt_func_ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor); + uint32_t filler); typedef pixman_bool_t (*pixman_iter_init_func_t) (pixman_implementation_t *imp, pixman_iter_t *iter); void _pixman_setup_combiner_functions_16 (pixman_implementation_t *imp); void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp); typedef struct @@ -501,33 +498,34 @@ struct pixman_implementation_t pixman_implementation_t * fallback; const pixman_fast_path_t * fast_paths; pixman_blt_func_t blt; pixman_fill_func_t fill; pixman_iter_init_func_t src_iter_init; pixman_iter_init_func_t dest_iter_init; + pixman_combine_32_func_t combine_16[PIXMAN_N_OPERATORS]; + pixman_combine_32_func_t combine_16_ca[PIXMAN_N_OPERATORS]; pixman_combine_32_func_t combine_32[PIXMAN_N_OPERATORS]; pixman_combine_32_func_t combine_32_ca[PIXMAN_N_OPERATORS]; pixman_combine_float_func_t combine_float[PIXMAN_N_OPERATORS]; pixman_combine_float_func_t combine_float_ca[PIXMAN_N_OPERATORS]; - pixman_combine_32_func_t combine_16[PIXMAN_N_OPERATORS]; }; uint32_t _pixman_image_get_solid (pixman_implementation_t *imp, pixman_image_t * image, pixman_format_code_t format); pixman_implementation_t * _pixman_implementation_create (pixman_implementation_t *fallback, const pixman_fast_path_t *fast_paths); -pixman_bool_t +void _pixman_implementation_lookup_composite (pixman_implementation_t *toplevel, pixman_op_t op, pixman_format_code_t src_format, uint32_t src_flags, pixman_format_code_t mask_format, uint32_t mask_flags, pixman_format_code_t dest_format, uint32_t dest_flags, @@ -560,17 +558,17 @@ pixman_bool_t _pixman_implementation_fill (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor); + uint32_t filler); pixman_bool_t _pixman_implementation_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter, pixman_image_t *image, int x, int y, int width, @@ -705,17 +703,18 @@ uint32_t * #define FAST_PATH_Y_UNIT_ZERO (1 << 18) #define FAST_PATH_BILINEAR_FILTER (1 << 19) #define FAST_PATH_ROTATE_90_TRANSFORM (1 << 20) #define FAST_PATH_ROTATE_180_TRANSFORM (1 << 21) #define FAST_PATH_ROTATE_270_TRANSFORM (1 << 22) #define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST (1 << 23) #define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR (1 << 24) #define FAST_PATH_BITS_IMAGE (1 << 25) -#define FAST_PATH_16_FORMAT (1 << 26) +#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER (1 << 26) +#define FAST_PATH_16_FORMAT (1 << 27) #define FAST_PATH_PAD_REPEAT \ (FAST_PATH_NO_NONE_REPEAT | \ FAST_PATH_NO_NORMAL_REPEAT | \ FAST_PATH_NO_REFLECT_REPEAT) #define FAST_PATH_NORMAL_REPEAT \ (FAST_PATH_NO_NONE_REPEAT | \ @@ -902,32 +901,61 @@ 
pixman_list_move_to_front (pixman_list_t /* Modulus that produces the remainder wrt. DIV */ #define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b)) #define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v))) /* Conversion between 8888 and 0565 */ -#define CONVERT_8888_TO_0565(s) \ - ((((s) >> 3) & 0x001f) | \ - (((s) >> 5) & 0x07e0) | \ - (((s) >> 8) & 0xf800)) +static force_inline uint16_t +convert_8888_to_0565 (uint32_t s) +{ + /* The following code can be compiled into just 4 instructions on ARM */ + uint32_t a, b; + a = (s >> 3) & 0x1F001F; + b = s & 0xFC00; + a |= a >> 5; + a |= b >> 5; + return (uint16_t)a; +} -#define CONVERT_0565_TO_0888(s) \ - (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | \ - ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | \ - ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))) +static force_inline uint32_t +convert_0565_to_0888 (uint16_t s) +{ + return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | + ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | + ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))); +} -#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000) +static force_inline uint32_t +convert_0565_to_8888 (uint16_t s) +{ + return convert_0565_to_0888 (s) | 0xff000000; +} /* Trivial versions that are useful in macros */ -#define CONVERT_8888_TO_8888(s) (s) -#define CONVERT_x888_TO_8888(s) ((s) | 0xff000000) -#define CONVERT_0565_TO_0565(s) (s) + +static force_inline uint32_t +convert_8888_to_8888 (uint32_t s) +{ + return s; +} + +static force_inline uint32_t +convert_x888_to_8888 (uint32_t s) +{ + return s | 0xff000000; +} + +static force_inline uint16_t +convert_0565_to_0565 (uint16_t s) +{ + return s; +} #define PIXMAN_FORMAT_IS_WIDE(f) \ (PIXMAN_FORMAT_A (f) > 8 || \ PIXMAN_FORMAT_R (f) > 8 || \ PIXMAN_FORMAT_G (f) > 8 || \ PIXMAN_FORMAT_B (f) > 8 || \ PIXMAN_FORMAT_TYPE (f) == PIXMAN_TYPE_ARGB_SRGB) @@ -1044,17 +1072,17 @@ void if (!(expr)) \ _pixman_log_error (FUNC, "The expression " # expr " was false"); \ } \ while (0) #else -#define _pixman_log_error(f,m) do { } while (0) \ +#define _pixman_log_error(f,m) do { } while (0) #define return_if_fail(expr) \ do \ { \ if (!(expr)) \ return; \ } \ while (0) @@ -1070,16 +1098,37 @@ void #define critical_if_fail(expr) \ do \ { \ } \ while (0) #endif /* + * Matrix + */ + +typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t; + +pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); + +void +pixman_transform_point_31_16_3d (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); + +void +pixman_transform_point_31_16_affine (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); + +/* * Timers */ #ifdef PIXMAN_TIMERS static inline uint64_t oil_profile_stamp_rdtsc (void) {
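The macro-to-inline conversion also swaps in a cheaper 0565 pack: rather than masking three channels separately, convert_8888_to_0565 carries both 5-bit channels in one masked word, which is where the four-ARM-instructions claim comes from. Tracing it on one concrete pixel shows how the bits land:

    /* s = 0xffe48c37: r = 0xe4, g = 0x8c, b = 0x37
     *
     * a = (s >> 3) & 0x1F001F;  a == 0x001C0006   r>>3 in bits 16..20,
     *                                             b>>3 in bits 0..4
     * b =  s       & 0xFC00;    b == 0x00008C00   g's top 6 bits, in place
     * a |= a >> 5;              a == 0x001CE006   r5 copied to bits 11..15
     * a |= b >> 5;              a == 0x001CE466   g6 dropped into bits 5..10
     * (uint16_t) a              ->  0xe466, i.e. rrrrr gggggg bbbbb
     */
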
--- a/gfx/cairo/libpixman/src/pixman-radial-gradient.c +++ b/gfx/cairo/libpixman/src/pixman-radial-gradient.c @@ -106,17 +106,17 @@ radial_compute_color (double t = pixman_fixed_1 / 2 * c / b; if (repeat == PIXMAN_REPEAT_NONE) { if (0 <= t && t <= pixman_fixed_1) return _pixman_gradient_walker_pixel (walker, t); } else { - if (t * dr > mindr) + if (t * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t); } return 0; } discr = fdot (b, a, 0, b, -c, 0); if (discr >= 0) @@ -142,19 +142,19 @@ radial_compute_color (double { if (0 <= t0 && t0 <= pixman_fixed_1) return _pixman_gradient_walker_pixel (walker, t0); else if (0 <= t1 && t1 <= pixman_fixed_1) return _pixman_gradient_walker_pixel (walker, t1); } else { - if (t0 * dr > mindr) + if (t0 * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t0); - else if (t1 * dr > mindr) + else if (t1 * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t1); } } return 0; } static uint32_t * @@ -397,21 +397,16 @@ radial_get_scanline_narrow (pixman_iter_ v.vector[2] += unit.vector[2]; } } iter->y++; return iter->buffer; } -static uint16_t convert_8888_to_0565(uint32_t color) -{ - return CONVERT_8888_TO_0565(color); -} - static uint32_t * radial_get_scanline_16 (pixman_iter_t *iter, const uint32_t *mask) { /* * Implementation of radial gradients following the PDF specification. * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference * Manual (PDF 32000-1:2008 at the time of this writing). *
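The comparison change is a boundary fix. For the repeating cases the gradient is drawn wherever the interpolated radius is non-negative: with r(t) = r1 + t*dr the test is t*dr >= -r1, and mindr appears to encode that -r1 bound (an assumption based on how the rest of this file sets it up). The old strict > also rejected the solution where the interpolated circle degenerates to a single point, r(t) = 0. In isolation:

    /* Closed-interval membership test for the interpolated radius
     * (sketch; assumes mindr == -r1 in the caller's fixed-point scale). */
    static int
    radius_valid (double t, double dr, double mindr)
    {
        /* r(t) = r1 + t*dr >= 0  <=>  t*dr >= -r1 */
        return t * dr >= mindr;   /* ">" here dropped the r(t) == 0 circle */
    }
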
--- a/gfx/cairo/libpixman/src/pixman-region.c +++ b/gfx/cairo/libpixman/src/pixman-region.c @@ -197,17 +197,17 @@ PIXREGION_SZOF (size_t n) return 0; if (sizeof(region_data_type_t) > UINT32_MAX - size) return 0; return size + sizeof(region_data_type_t); } -static void * +static region_data_type_t * alloc_data (size_t n) { size_t sz = PIXREGION_SZOF (n); if (!sz) return NULL; return malloc (sz);
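Returning the concrete region_data_type_t * instead of void * costs nothing at call sites that are already correct, and turns a mismatched assignment into a compile-time diagnostic instead of a silent conversion. For example (hypothetical call sites):

    region_data_type_t *data = alloc_data (n);   /* fine, as before */
    pixman_box32_t *boxes = alloc_data (n);      /* now draws an
                                                    incompatible-pointer
                                                    diagnostic */
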
--- a/gfx/cairo/libpixman/src/pixman-solid-fill.c +++ b/gfx/cairo/libpixman/src/pixman-solid-fill.c @@ -21,41 +21,16 @@ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #ifdef HAVE_CONFIG_H #include <config.h> #endif #include "pixman-private.h" -void -_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter) -{ - if (iter->iter_flags & ITER_NARROW) - { - uint32_t *b = (uint32_t *)iter->buffer; - uint32_t *e = b + iter->width; - uint32_t color = iter->image->solid.color_32; - - while (b < e) - *(b++) = color; - } - else - { - argb_t *b = (argb_t *)iter->buffer; - argb_t *e = b + iter->width; - argb_t color = image->solid.color_float; - - while (b < e) - *(b++) = color; - } - - iter->get_scanline = _pixman_iter_get_scanline_noop; -} - static uint32_t color_to_uint32 (const pixman_color_t *color) { return (color->alpha >> 8 << 24) | (color->red >> 8 << 16) | (color->green & 0xff00) | (color->blue >> 8);
--- a/gfx/cairo/libpixman/src/pixman-sse2.c +++ b/gfx/cairo/libpixman/src/pixman-sse2.c @@ -571,17 +571,17 @@ static force_inline void core_combine_over_u_sse2_mask (uint32_t * pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; /* Align dst on a 16-byte boundary */ - while (w && ((unsigned long)pd & 15)) + while (w && ((uintptr_t)pd & 15)) { d = *pd; s = combine1 (ps, pm); if (s) *pd = core_combine_over_u_pixel_sse2 (s, d); pd++; ps++; @@ -656,17 +656,17 @@ core_combine_over_u_sse2_mask (uint32_t static force_inline void core_combine_over_u_sse2_no_mask (uint32_t * pd, const uint32_t* ps, int w) { uint32_t s, d; /* Align dst on a 16-byte boundary */ - while (w && ((unsigned long)pd & 15)) + while (w && ((uintptr_t)pd & 15)) { d = *pd; s = *ps; if (s) *pd = core_combine_over_u_pixel_sse2 (s, d); pd++; ps++; @@ -748,17 +748,17 @@ sse2_combine_over_reverse_u (pixman_impl uint32_t s, d; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; /* Align dst on a 16-byte boundary */ while (w && - ((unsigned long)pd & 15)) + ((uintptr_t)pd & 15)) { d = *pd; s = combine1 (ps, pm); *pd++ = core_combine_over_u_pixel_sse2 (d, s); w--; ps++; if (pm) @@ -835,17 +835,17 @@ sse2_combine_in_u (pixman_implementation const uint32_t * pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixel_sse2 (d, s); w--; ps++; if (pm) @@ -896,17 +896,17 @@ sse2_combine_in_reverse_u (pixman_implem const uint32_t * pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixel_sse2 (s, d); ps++; w--; if (pm) @@ -952,17 +952,17 @@ sse2_combine_in_reverse_u (pixman_implem static void sse2_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * pd, const uint32_t * ps, const uint32_t * pm, int w) { - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( unpack_32_1x128 (d), negate_1x128 ( expand_alpha_1x128 (unpack_32_1x128 (s))))); @@ -1021,17 +1021,17 @@ sse2_combine_out_reverse_u (pixman_imple static void sse2_combine_out_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * pd, const uint32_t * ps, const uint32_t * pm, int w) { - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( unpack_32_1x128 (s), negate_1x128 ( expand_alpha_1x128 (unpack_32_1x128 (d))))); @@ -1108,17 +1108,17 @@ sse2_combine_atop_u (pixman_implementati { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_atop_u_pixel_sse2 (s, d); w--; ps++; if (pm) @@ -1192,17 +1192,17 @@ sse2_combine_atop_reverse_u (pixman_impl { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - while (w && ((unsigned long) pd & 15)) + while (w && 
((uintptr_t)pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); ps++; w--; if (pm) @@ -1280,17 +1280,17 @@ sse2_combine_xor_u (pixman_implementatio const uint32_t* ps = src; const uint32_t* pm = mask; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - while (w && ((unsigned long) pd & 15)) + while (w && ((uintptr_t)pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_xor_u_pixel_sse2 (s, d); w--; ps++; if (pm) @@ -1352,17 +1352,17 @@ sse2_combine_add_u (pixman_implementatio int width) { int w = width; uint32_t s, d; uint32_t* pd = dst; const uint32_t* ps = src; const uint32_t* pm = mask; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = combine1 (ps, pm); d = *pd; ps++; if (pm) pm++; *pd++ = _mm_cvtsi128_si32 ( @@ -1425,17 +1425,17 @@ sse2_combine_saturate_u (pixman_implemen const uint32_t * pm, int w) { uint32_t s, d; uint32_t pack_cmp; __m128i xmm_src, xmm_dst; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); w--; ps++; if (pm) @@ -1513,17 +1513,17 @@ sse2_combine_src_ca (pixman_implementati int w) { uint32_t s, m; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst_lo, xmm_dst_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; *pd++ = pack_1x128_32 ( pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); w--; } @@ -1581,17 +1581,17 @@ sse2_combine_over_ca (pixman_implementat { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); w--; } @@ -1657,17 +1657,17 @@ sse2_combine_over_reverse_ca (pixman_imp { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); w--; } @@ -1722,17 +1722,17 @@ sse2_combine_in_ca (pixman_implementatio { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), @@ -1797,17 +1797,17 @@ sse2_combine_in_reverse_ca (pixman_imple { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( unpack_32_1x128 (d), @@ -1870,17 +1870,17 @@ sse2_combine_out_ca (pixman_implementati { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = 
*pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( pix_multiply_1x128 ( @@ -1946,17 +1946,17 @@ sse2_combine_out_reverse_ca (pixman_impl { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x128_32 ( pix_multiply_1x128 ( unpack_32_1x128 (d), @@ -2043,17 +2043,17 @@ sse2_combine_atop_ca (pixman_implementat uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); w--; } @@ -2136,17 +2136,17 @@ sse2_combine_atop_reverse_ca (pixman_imp uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); w--; } @@ -2232,17 +2232,17 @@ sse2_combine_xor_ca (pixman_implementati uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); w--; } @@ -2308,17 +2308,17 @@ sse2_combine_add_ca (pixman_implementati int w) { uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x128_32 ( _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), @@ -2409,17 +2409,17 @@ sse2_composite_over_n_8888 (pixman_imple while (height--) { dst = dst_line; dst_line += dst_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { d = *dst; *dst++ = pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, unpack_32_1x128 (d))); w--; } @@ -2478,17 +2478,17 @@ sse2_composite_over_n_0565 (pixman_imple while (height--) { dst = dst_line; dst_line += dst_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { d = *dst; *dst++ = pack_565_32_16 ( pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, expand565_16_1x128 (d)))); w--; @@ -2563,17 +2563,17 @@ sse2_composite_add_n_8888_8888_ca (pixma { int w = width; const uint32_t *pm = (uint32_t *)mask_line; uint32_t *pd = (uint32_t *)dst_line; dst_line += dst_stride; mask_line += mask_stride; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x128 (m); @@ -2677,17 +2677,17 @@ sse2_composite_over_n_8888_8888_ca (pixm { int w = width; const uint32_t *pm = (uint32_t *)mask_line; uint32_t *pd = (uint32_t *)dst_line; dst_line += dst_stride; mask_line += mask_stride; - while (w && (unsigned long)pd & 15) + while (w && (uintptr_t)pd & 15) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x128 (m); mmx_dest = unpack_32_1x128 (d); @@ -2781,17 +2781,17 @@ 
sse2_composite_over_8888_n_8888 (pixman_ while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { uint32_t s = *src++; if (s) { uint32_t d = *dst; __m128i ms = unpack_32_1x128 (s); @@ -2873,20 +2873,20 @@ sse2_composite_src_x888_0565 (pixman_imp while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { s = *src++; - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); dst++; w--; } while (w >= 8) { __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); @@ -2896,17 +2896,17 @@ sse2_composite_src_x888_0565 (pixman_imp w -= 8; src += 8; dst += 8; } while (w) { s = *src++; - *dst = CONVERT_8888_TO_0565 (s); + *dst = convert_8888_to_0565 (s); dst++; w--; } } } static void sse2_composite_src_x888_8888 (pixman_implementation_t *imp, @@ -2927,17 +2927,17 @@ sse2_composite_src_x888_8888 (pixman_imp while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { *dst++ = *src++ | 0xff000000; w--; } while (w >= 16) { __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; @@ -2994,17 +2994,17 @@ sse2_composite_over_x888_n_8888 (pixman_ while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; __m128i src = unpack_32_1x128 (s); __m128i alpha = xmm_alpha; __m128i mask = xmm_mask; __m128i dest = unpack_32_1x128 (d); @@ -3120,17 +3120,17 @@ sse2_composite_over_8888_0565 (pixman_im src = src_line; dst_line += dst_stride; src_line += src_stride; w = width; /* Align dst on a 16-byte boundary */ while (w && - ((unsigned long)dst & 15)) + ((uintptr_t)dst & 15)) { s = *src++; d = *dst; *dst++ = composite_over_8888_0565pixel (s, d); w--; } @@ -3226,17 +3226,17 @@ sse2_composite_over_n_8_8888 (pixman_imp while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { uint8_t m = *mask++; if (m) { d = *dst; mmx_mask = expand_pixel_8_1x128 (m); mmx_dest = unpack_32_1x128 (d); @@ -3316,84 +3316,84 @@ static pixman_bool_t sse2_fill (pixman_implementation_t *imp, uint32_t * bits, int stride, int bpp, int x, int y, int width, int height, - uint32_t xor) + uint32_t filler) { uint32_t byte_width; uint8_t *byte_line; __m128i xmm_def; if (bpp == 8) { uint8_t b; uint16_t w; stride = stride * (int) sizeof (uint32_t) / 1; byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); byte_width = width; stride *= 1; - b = xor & 0xff; + b = filler & 0xff; w = (b << 8) | b; - xor = (w << 16) | w; + filler = (w << 16) | w; } else if (bpp == 16) { stride = stride * (int) sizeof (uint32_t) / 2; byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; - xor = (xor & 0xffff) * 0x00010001; + filler = (filler & 0xffff) * 0x00010001; } else if (bpp == 32) { stride = stride * (int) sizeof (uint32_t) / 4; byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); byte_width = 4 * width; stride *= 4; } else { return FALSE; } - xmm_def = 
create_mask_2x32_128 (xor, xor); + xmm_def = create_mask_2x32_128 (filler, filler); while (height--) { int w; uint8_t *d = byte_line; byte_line += stride; w = byte_width; - if (w >= 1 && ((unsigned long)d & 1)) + if (w >= 1 && ((uintptr_t)d & 1)) { - *(uint8_t *)d = xor; + *(uint8_t *)d = filler; w -= 1; d += 1; } - while (w >= 2 && ((unsigned long)d & 3)) + while (w >= 2 && ((uintptr_t)d & 3)) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } - while (w >= 4 && ((unsigned long)d & 15)) + while (w >= 4 && ((uintptr_t)d & 15)) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; } while (w >= 128) { save_128_aligned ((__m128i*)(d), xmm_def); @@ -3434,32 +3434,32 @@ sse2_fill (pixman_implementation_t *imp, save_128_aligned ((__m128i*)(d), xmm_def); d += 16; w -= 16; } while (w >= 4) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; } if (w >= 2) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } if (w >= 1) { - *(uint8_t *)d = xor; + *(uint8_t *)d = filler; w -= 1; d += 1; } } return TRUE; } @@ -3500,17 +3500,17 @@ sse2_composite_src_n_8_8888 (pixman_impl while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { uint8_t m = *mask++; if (m) { *dst = pack_1x128_32 ( pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); } @@ -3616,17 +3616,17 @@ sse2_composite_over_n_8_0565 (pixman_imp while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { m = *mask++; if (m) { d = *dst; mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); mmx_dest = expand565_16_1x128 (d); @@ -3740,17 +3740,17 @@ sse2_composite_over_pixbuf_0565 (pixman_ while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { s = *src++; d = *dst; ms = unpack_32_1x128 (s); *dst++ = pack_565_32_16 ( pack_1x128_32 ( @@ -3849,17 +3849,17 @@ sse2_composite_over_pixbuf_8888 (pixman_ while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && (unsigned long)dst & 15) + while (w && (uintptr_t)dst & 15) { s = *src++; d = *dst; *dst++ = pack_1x128_32 ( over_rev_non_pre_1x128 ( unpack_32_1x128 (s), unpack_32_1x128 (d))); @@ -3952,17 +3952,17 @@ sse2_composite_over_n_8888_0565_ca (pixm while (height--) { w = width; mask = mask_line; dst = dst_line; mask_line += mask_stride; dst_line += dst_stride; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { m = *(uint32_t *) mask; if (m) { d = *dst; mmx_mask = unpack_32_1x128 (m); mmx_dest = expand565_16_1x128 (d); @@ -4078,17 +4078,17 @@ sse2_composite_in_n_8_8 (pixman_implemen while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x128_32 ( pix_multiply_1x128 ( pix_multiply_1x128 (xmm_alpha, unpack_32_1x128 (m)), @@ -4171,17 +4171,17 @@ sse2_composite_in_n_8 (pixman_implementa } while (height--) { dst = dst_line; dst_line += dst_stride; w = width; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { d = (uint32_t) *dst; 
*dst++ = (uint8_t) pack_1x128_32 ( pix_multiply_1x128 ( xmm_alpha, unpack_32_1x128 (d))); w--; @@ -4240,17 +4240,17 @@ sse2_composite_in_8_8 (pixman_implementa while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { s = (uint32_t) *src++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x128_32 ( pix_multiply_1x128 ( unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; @@ -4317,17 +4317,17 @@ sse2_composite_add_n_8_8 (pixman_impleme while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x128_32 ( _mm_adds_epu16 ( pix_multiply_1x128 ( xmm_alpha, unpack_32_1x128 (m)), @@ -4409,17 +4409,17 @@ sse2_composite_add_n_8 (pixman_implement xmm_src = _mm_set_epi32 (src, src, src, src); while (height--) { dst = dst_line; dst_line += dst_stride; w = width; - while (w && ((unsigned long)dst & 15)) + while (w && ((uintptr_t)dst & 15)) { *dst = (uint8_t)_mm_cvtsi128_si32 ( _mm_adds_epu8 ( xmm_src, _mm_cvtsi32_si128 (*dst))); w--; dst++; @@ -4469,17 +4469,17 @@ sse2_composite_add_8_8 (pixman_implement dst = dst_line; src = src_line; dst_line += dst_stride; src_line += src_stride; w = width; /* Small head */ - while (w && (unsigned long)dst & 3) + while (w && (uintptr_t)dst & 3) { t = (*dst) + (*src++); *dst++ = t | (0 - (t >> 8)); w--; } sse2_combine_add_u (imp, op, (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); @@ -4518,17 +4518,173 @@ sse2_composite_add_8888_8888 (pixman_imp { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; sse2_combine_add_u (imp, op, dst, src, NULL, width); } - +} + +static void +sse2_composite_add_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst, src; + int dst_stride; + + __m128i xmm_src; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + + if (src == ~0) + { + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, + dest_x, dest_y, width, height, ~0); + + return; + } + + xmm_src = _mm_set_epi32 (src, src, src, src); + while (height--) + { + int w = width; + uint32_t d; + + dst = dst_line; + dst_line += dst_stride; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = + _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); + w--; + } + + while (w >= 4) + { + save_128_aligned + ((__m128i*)dst, + _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + + dst += 4; + w -= 4; + } + + while (w--) + { + d = *dst; + *dst++ = + _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, + _mm_cvtsi32_si128 (d))); + } + } +} + +static void +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + xmm_src = expand_pixel_32_1x128 (src); + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, 
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        while (w && ((uintptr_t)dst & 15))
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            uint32_t m = *(uint32_t*)mask;
+            if (m)
+            {
+                __m128i xmm_mask_lo, xmm_mask_hi;
+                __m128i xmm_dst_lo, xmm_dst_hi;
+
+                __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+                __m128i xmm_mask =
+                    _mm_unpacklo_epi8 (unpack_32_1x128(m),
+                                       _mm_setzero_si128 ());
+
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                        &xmm_mask_lo, &xmm_mask_hi);
+
+                pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                    &xmm_mask_lo, &xmm_mask_hi,
+                                    &xmm_mask_lo, &xmm_mask_hi);
+
+                xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+                xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+                save_128_aligned (
+                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            w -= 4;
+            dst += 4;
+            mask += 4;
+        }
+
+        while (w)
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+    }
 }

 static pixman_bool_t
 sse2_blt (pixman_implementation_t *imp,
           uint32_t *               src_bits,
           uint32_t *               dst_bits,
           int                      src_stride,
           int                      dst_stride,
@@ -4577,25 +4733,25 @@ sse2_blt (pixman_implementation_t *imp,
     {
         int w;
         uint8_t *s = src_bytes;
         uint8_t *d = dst_bytes;
         src_bytes += src_stride;
         dst_bytes += dst_stride;
         w = byte_width;

-        while (w >= 2 && ((unsigned long)d & 3))
+        while (w >= 2 && ((uintptr_t)d & 3))
         {
             *(uint16_t *)d = *(uint16_t *)s;
             w -= 2;
             s += 2;
             d += 2;
         }

-        while (w >= 4 && ((unsigned long)d & 15))
+        while (w >= 4 && ((uintptr_t)d & 15))
         {
             *(uint32_t *)d = *(uint32_t *)s;

             w -= 4;
             s += 4;
             d += 4;
         }

@@ -4692,17 +4848,17 @@ sse2_composite_over_x888_8_8888 (pixman_
         src_line += src_stride;
         dst = dst_line;
         dst_line += dst_stride;
         mask = mask_line;
         mask_line += mask_stride;
         w = width;

-        while (w && (unsigned long)dst & 15)
+        while (w && (uintptr_t)dst & 15)
         {
             s = 0xff000000 | *src++;
             m = (uint32_t) *mask++;
             d = *dst;
             ms = unpack_32_1x128 (s);

             if (m != 0xff)
             {
@@ -4816,17 +4972,17 @@ sse2_composite_over_8888_8_8888 (pixman_
         src_line += src_stride;
         dst = dst_line;
         dst_line += dst_stride;
         mask = mask_line;
         mask_line += mask_stride;
         w = width;

-        while (w && (unsigned long)dst & 15)
+        while (w && (uintptr_t)dst & 15)
         {
             uint32_t sa;

             s = *src++;
             m = (uint32_t) *mask++;
             d = *dst;

             sa = s >> 24;
@@ -4955,17 +5111,17 @@ sse2_composite_over_reverse_n_8888 (pixm
     while (height--)
     {
         dst = dst_line;
         dst_line += dst_stride;
         w = width;

-        while (w && (unsigned long)dst & 15)
+        while (w && (uintptr_t)dst & 15)
         {
             __m128i vd;

             vd = unpack_32_1x128 (*dst);

             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                               xmm_src));
             w--;
@@ -5040,17 +5196,17 @@ sse2_composite_over_8888_8888_8888 (pixm
         src_line += src_stride;
         dst = dst_line;
         dst_line += dst_stride;
         mask = mask_line;
         mask_line += mask_stride;
         w = width;

-        while (w && (unsigned long)dst & 15)
+        while (w && (uintptr_t)dst & 15)
         {
             uint32_t sa;

             s = *src++;
             m = (*mask++) >> 24;
             d = *dst;

             sa = s >> 24;
@@ -5168,17 +5324,17 @@ scaled_nearest_scanline_sse2_8888_8888_O
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

     if (fully_transparent_src)
         return;

     /* Align dst on a 16-byte boundary */
-    while (w && ((unsigned long)pd & 15))
+    while (w && ((uintptr_t)pd & 15))
     {
         d = *pd;
         s = combine1 (ps + pixman_fixed_to_int (vx), pm);
         vx += unit_x;
         while (vx >= 0)
             vx -= src_width_fixed;

         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
@@ -5286,17 +5442,17 @@ scaled_nearest_scanline_sse2_8888_n_8888
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

     if (zero_src || (*mask >> 24) == 0)
         return;

     xmm_mask = create_mask_16_128 (*mask >> 24);

-    while (w && (unsigned long)dst & 15)
+    while (w && (uintptr_t)dst & 15)
     {
         uint32_t s = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
         while (vx >= 0)
             vx -= src_width_fixed;

         if (s)
         {
@@ -5536,17 +5692,17 @@ scaled_bilinear_scanline_sse2_8888_8888_
                                              pixman_fixed_t vx,
                                              pixman_fixed_t unit_x,
                                              pixman_fixed_t max_vx,
                                              pixman_bool_t zero_src)
 {
     BILINEAR_DECLARE_VARIABLES;
     uint32_t pix1, pix2, pix3, pix4;

-    while (w && ((unsigned long)dst & 15))
+    while (w && ((uintptr_t)dst & 15))
     {
         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
         if (pix1)
         {
             pix2 = *dst;
             *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
         }

@@ -5671,17 +5827,17 @@ scaled_bilinear_scanline_sse2_8888_8_888
                                             pixman_fixed_t unit_x,
                                             pixman_fixed_t max_vx,
                                             pixman_bool_t zero_src)
 {
     BILINEAR_DECLARE_VARIABLES;
     uint32_t pix1, pix2, pix3, pix4;
     uint32_t m;

-    while (w && ((unsigned long)dst & 15))
+    while (w && ((uintptr_t)dst & 15))
     {
         uint32_t sa;

         m = (uint32_t) *mask++;

         if (m)
         {
             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

@@ -5818,16 +5974,131 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888
                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
                                uint32_t, uint8_t, uint32_t,
                                NONE, FLAG_HAVE_NON_SOLID_MASK)
 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
                                uint32_t, uint8_t, uint32_t,
                                NORMAL, FLAG_HAVE_NON_SOLID_MASK)

+static force_inline void
+scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
+                                                const uint32_t * mask,
+                                                const uint32_t * src_top,
+                                                const uint32_t * src_bottom,
+                                                int32_t          w,
+                                                int              wt,
+                                                int              wb,
+                                                pixman_fixed_t   vx,
+                                                pixman_fixed_t   unit_x,
+                                                pixman_fixed_t   max_vx,
+                                                pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    __m128i xmm_mask;
+
+    if (zero_src || (*mask >> 24) == 0)
+        return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+        if (pix1)
+        {
+            uint32_t d = *dst;
+
+            __m128i ms = unpack_32_1x128 (pix1);
+            __m128i alpha = expand_alpha_1x128 (ms);
+            __m128i dest = xmm_mask;
+            __m128i alpha_dst = unpack_32_1x128 (d);
+
+            *dst = pack_1x128_32
+                (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+        }
+
+        dst++;
+        w--;
+    }
+
+    while (w >= 4)
+    {
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+        if (pix1 | pix2 | pix3 | pix4)
+        {
+            __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+            __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+            __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+            xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+            xmm_dst = load_128_aligned ((__m128i*)dst);
+
+            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                                &xmm_alpha_lo, &xmm_alpha_hi);
+
+            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_mask, &xmm_mask,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+            save_128_aligned
+                ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+        }
+
+        dst += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+        if (pix1)
+        {
+            uint32_t d = *dst;
+
+            __m128i ms = unpack_32_1x128 (pix1);
+            __m128i alpha = expand_alpha_1x128 (ms);
+            __m128i dest = xmm_mask;
+            __m128i alpha_dst = unpack_32_1x128 (d);
+
+            *dst = pack_1x128_32
+                (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+        }
+
+        dst++;
+        w--;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
+                               uint32_t, uint32_t, uint32_t,
+                               COVER, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
+                               uint32_t, uint32_t, uint32_t,
+                               PAD, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
+                               uint32_t, uint32_t, uint32_t,
+                               NONE, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
+                               uint32_t, uint32_t, uint32_t,
+                               NORMAL, FLAG_HAVE_SOLID_MASK)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
@@ -5880,16 +6151,24 @@ static const pixman_fast_path_t sse2_fas
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

     /* PIXMAN_OP_SRC */
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
@@ -5944,16 +6223,21 @@ static const pixman_fast_path_t sse2_fas
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

     /* and here the needed entries are added to the fast path table */

     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
@@ -5967,17 +6251,17 @@ sse2_fetch_x8r8g8b8 (pixman_iter_t *iter
 {
     int w = iter->width;
     __m128i ff000000 = mask_ff000000;
     uint32_t *dst = iter->buffer;
     uint32_t *src = (uint32_t *)iter->bits;

     iter->bits += iter->stride;

-    while (w && ((unsigned long)dst) & 0x0f)
+    while (w && ((uintptr_t)dst) & 0x0f)
     {
         *dst++ = (*src++) | 0xff000000;
         w--;
     }

     while (w >= 4)
     {
         save_128_aligned (
@@ -6003,21 +6287,21 @@ sse2_fetch_r5g6b5 (pixman_iter_t *iter,
 {
     int w = iter->width;
     uint32_t *dst = iter->buffer;
     uint16_t *src = (uint16_t *)iter->bits;
     __m128i ff000000 = mask_ff000000;

     iter->bits += iter->stride;

-    while (w && ((unsigned long)dst) & 0x0f)
+    while (w && ((uintptr_t)dst) & 0x0f)
     {
         uint16_t s = *src++;

-        *dst++ = CONVERT_0565_TO_8888 (s);
+        *dst++ = convert_0565_to_8888 (s);
         w--;
     }

     while (w >= 8)
     {
         __m128i lo, hi, s;

         s = _mm_loadu_si128 ((__m128i *)src);

@@ -6032,34 +6316,34 @@ sse2_fetch_r5g6b5 (pixman_iter_t *iter,
         src += 8;
         w -= 8;
     }

     while (w)
     {
         uint16_t s = *src++;

-        *dst++ = CONVERT_0565_TO_8888 (s);
+        *dst++ = convert_0565_to_8888 (s);
         w--;
     }

     return iter->buffer;
 }

 static uint32_t *
 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
 {
     int w = iter->width;
     uint32_t *dst = iter->buffer;
     uint8_t *src = iter->bits;
     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

     iter->bits += iter->stride;

-    while (w && (((unsigned long)dst) & 15))
+    while (w && (((uintptr_t)dst) & 15))
     {
         *dst++ = *(src++) << 24;
         w--;
     }

     while (w >= 16)
     {
         xmm0 = _mm_loadu_si128((__m128i *)src);
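Every pixman-sse2.c hunk above converts the same alignment idiom: each scanline loop peels off narrow writes until the destination pointer reaches a 16-byte boundary, runs aligned 128-bit SSE2 stores for the bulk, then finishes the remainder. The casts move from unsigned long to uintptr_t because unsigned long is only 32 bits on LLP64 targets such as 64-bit Windows, whereas uintptr_t is guaranteed wide enough to hold a pointer; for a low-bits alignment test the narrow cast happens to work, but uintptr_t is the portable spelling. A minimal standalone sketch of the pattern (the function fill8 and its shape are invented for illustration; this is not pixman code):

#include <emmintrin.h>
#include <stdint.h>

static void
fill8 (uint8_t *d, uint8_t value, int w)
{
    __m128i xmm = _mm_set1_epi8 ((char) value);

    /* Head: byte stores until d is 16-byte aligned. */
    while (w > 0 && ((uintptr_t) d & 15))
    {
        *d++ = value;
        w--;
    }

    /* Body: aligned 16-byte stores. */
    while (w >= 16)
    {
        _mm_store_si128 ((__m128i *) d, xmm);
        d += 16;
        w -= 16;
    }

    /* Tail: whatever is left. */
    while (w-- > 0)
        *d++ = value;
}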
--- a/gfx/cairo/libpixman/src/pixman-trap.c
+++ b/gfx/cairo/libpixman/src/pixman-trap.c
@@ -486,16 +486,18 @@ pixman_composite_trapezoids (pixman_op_t
                              int                        y_src,
                              int                        x_dst,
                              int                        y_dst,
                              int                        n_traps,
                              const pixman_trapezoid_t * traps)
 {
     int i;

+    return_if_fail (PIXMAN_FORMAT_TYPE (mask_format) == PIXMAN_TYPE_A);
+
     if (n_traps <= 0)
         return;

     _pixman_image_validate (src);
     _pixman_image_validate (dst);

     if (op == PIXMAN_OP_ADD &&
         (src->common.flags & FAST_PATH_IS_OPAQUE) &&
@@ -516,18 +518,19 @@ pixman_composite_trapezoids (pixman_op_t
     {
         pixman_image_t *tmp;
         pixman_box32_t box;
         int i;

         if (!get_trap_extents (op, dst, traps, n_traps, &box))
             return;

-        tmp = pixman_image_create_bits (
-            mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1);
+        if (!(tmp = pixman_image_create_bits (
+                  mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1)))
+            return;

         for (i = 0; i < n_traps; ++i)
         {
             const pixman_trapezoid_t *trap = &(traps[i]);

             if (!pixman_trapezoid_valid (trap))
                 continue;
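The two pixman-trap.c fixes harden pixman_composite_trapezoids against misuse: the mask format must be an alpha-only (PIXMAN_TYPE_A) format, and a failed mask allocation must no longer be dereferenced. A hedged caller-side sketch of the contract the new return_if_fail enforces (the trapezoid geometry and function name here are invented for the example):

#include <pixman.h>

/* Rasterize one made-up 10x10 trapezoid.  PIXMAN_a8 satisfies the new
 * PIXMAN_TYPE_A requirement on mask_format; an ARGB format such as
 * PIXMAN_a8r8g8b8 would now be rejected up front. */
static void
rasterize_one_trap (pixman_image_t *src, pixman_image_t *dst)
{
    pixman_trapezoid_t trap = {
        .top    = pixman_int_to_fixed (0),
        .bottom = pixman_int_to_fixed (10),
        .left   = { { pixman_int_to_fixed (0),  pixman_int_to_fixed (0) },
                    { pixman_int_to_fixed (0),  pixman_int_to_fixed (10) } },
        .right  = { { pixman_int_to_fixed (10), pixman_int_to_fixed (0) },
                    { pixman_int_to_fixed (10), pixman_int_to_fixed (10) } },
    };

    pixman_composite_trapezoids (PIXMAN_OP_OVER, src, dst, PIXMAN_a8,
                                 0, 0, 0, 0, 1, &trap);
}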
--- a/gfx/cairo/libpixman/src/pixman.c
+++ b/gfx/cairo/libpixman/src/pixman.c
@@ -450,16 +450,24 @@ analyze_extent (pixman_image_t *im
     case PIXMAN_FILTER_CONVOLUTION:
         params = image->common.filter_params;
         x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
         y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
         width = params[0];
         height = params[1];
         break;

+    case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+        params = image->common.filter_params;
+        x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
+        y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
+        width = params[0];
+        height = params[1];
+        break;
+
     case PIXMAN_FILTER_GOOD:
     case PIXMAN_FILTER_BEST:
     case PIXMAN_FILTER_BILINEAR:
         x_off = - pixman_fixed_1 / 2;
         y_off = - pixman_fixed_1 / 2;
         width = pixman_fixed_1;
         height = pixman_fixed_1;
         break;

@@ -568,49 +576,51 @@ pixman_image_composite32 (pixman_op_t
                           int32_t          mask_x,
                           int32_t          mask_y,
                           int32_t          dest_x,
                           int32_t          dest_y,
                           int32_t          width,
                           int32_t          height)
 {
     pixman_format_code_t src_format, mask_format, dest_format;
-    uint32_t src_flags, mask_flags, dest_flags;
     pixman_region32_t region;
     pixman_box32_t extents;
     pixman_implementation_t *imp;
     pixman_composite_func_t func;
+    pixman_composite_info_t info;
+    const pixman_box32_t *pbox;
+    int n;

     _pixman_image_validate (src);
     if (mask)
         _pixman_image_validate (mask);
     _pixman_image_validate (dest);

     src_format = src->common.extended_format_code;
-    src_flags = src->common.flags;
+    info.src_flags = src->common.flags;

-    if (mask)
+    if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE))
     {
         mask_format = mask->common.extended_format_code;
-        mask_flags = mask->common.flags;
+        info.mask_flags = mask->common.flags;
     }
     else
     {
         mask_format = PIXMAN_null;
-        mask_flags = FAST_PATH_IS_OPAQUE;
+        info.mask_flags = FAST_PATH_IS_OPAQUE;
     }

     dest_format = dest->common.extended_format_code;
-    dest_flags = dest->common.flags;
+    info.dest_flags = dest->common.flags;

     /* Check for pixbufs */
     if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
         (src->type == BITS && src->bits.bits == mask->bits.bits)           &&
         (src->common.repeat == mask->common.repeat)                        &&
-        (src_flags & mask_flags & FAST_PATH_ID_TRANSFORM)                  &&
+        (info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM)        &&
         (src_x == mask_x && src_y == mask_y))
     {
         if (src_format == PIXMAN_x8b8g8r8)
             src_format = mask_format = PIXMAN_pixbuf;
         else if (src_format == PIXMAN_x8r8g8b8)
             src_format = mask_format = PIXMAN_rpixbuf;
     }

@@ -625,90 +635,83 @@ pixman_image_composite32 (pixman_op_t
     extents = *pixman_region32_extents (&region);

     extents.x1 -= dest_x - src_x;
     extents.y1 -= dest_y - src_y;
     extents.x2 -= dest_x - src_x;
     extents.y2 -= dest_y - src_y;

-    if (!analyze_extent (src, &extents, &src_flags))
+    if (!analyze_extent (src, &extents, &info.src_flags))
         goto out;

     extents.x1 -= src_x - mask_x;
     extents.y1 -= src_y - mask_y;
     extents.x2 -= src_x - mask_x;
     extents.y2 -= src_y - mask_y;

-    if (!analyze_extent (mask, &extents, &mask_flags))
+    if (!analyze_extent (mask, &extents, &info.mask_flags))
         goto out;

     /* If the clip is within the source samples, and the samples are
      * opaque, then the source is effectively opaque.
      */
#define NEAREST_OPAQUE (FAST_PATH_SAMPLES_OPAQUE |                      \
                        FAST_PATH_NEAREST_FILTER |                      \
                        FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
#define BILINEAR_OPAQUE (FAST_PATH_SAMPLES_OPAQUE |                     \
                         FAST_PATH_BILINEAR_FILTER |                    \
                         FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)

-    if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-        (src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+        (info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-        src_flags |= FAST_PATH_IS_OPAQUE;
+        info.src_flags |= FAST_PATH_IS_OPAQUE;
     }

-    if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-        (mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+        (info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-        mask_flags |= FAST_PATH_IS_OPAQUE;
+        info.mask_flags |= FAST_PATH_IS_OPAQUE;
     }

     /*
      * Check if we can replace our operator by a simpler one
      * if the src or dest are opaque. The output operator should be
      * mathematically equivalent to the source.
      */
-    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+    info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags);

-    if (_pixman_implementation_lookup_composite (
-            get_implementation (), op,
-            src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
-            &imp, &func))
-    {
-        pixman_composite_info_t info;
-        const pixman_box32_t *pbox;
-        int n;
+    _pixman_implementation_lookup_composite (
+        get_implementation (), info.op,
+        src_format, info.src_flags,
+        mask_format, info.mask_flags,
+        dest_format, info.dest_flags,
+        &imp, &func);
+
+    info.src_image = src;
+    info.mask_image = mask;
+    info.dest_image = dest;
+
+    pbox = pixman_region32_rectangles (&region, &n);

-        info.op = op;
-        info.src_image = src;
-        info.mask_image = mask;
-        info.dest_image = dest;
-        info.src_flags = src_flags;
-        info.mask_flags = mask_flags;
-        info.dest_flags = dest_flags;
-
-        pbox = pixman_region32_rectangles (&region, &n);
+    while (n--)
+    {
+        info.src_x = pbox->x1 + src_x - dest_x;
+        info.src_y = pbox->y1 + src_y - dest_y;
+        info.mask_x = pbox->x1 + mask_x - dest_x;
+        info.mask_y = pbox->y1 + mask_y - dest_y;
+        info.dest_x = pbox->x1;
+        info.dest_y = pbox->y1;
+        info.width = pbox->x2 - pbox->x1;
+        info.height = pbox->y2 - pbox->y1;

-        while (n--)
-        {
-            info.src_x = pbox->x1 + src_x - dest_x;
-            info.src_y = pbox->y1 + src_y - dest_y;
-            info.mask_x = pbox->x1 + mask_x - dest_x;
-            info.mask_y = pbox->y1 + mask_y - dest_y;
-            info.dest_x = pbox->x1;
-            info.dest_y = pbox->y1;
-            info.width = pbox->x2 - pbox->x1;
-            info.height = pbox->y2 - pbox->y1;
+        func (imp, &info);

-            func (imp, &info);
-
-            pbox++;
-        }
+        pbox++;
     }

 out:
     pixman_region32_fini (&region);
 }

 PIXMAN_EXPORT void
 pixman_image_composite (pixman_op_t op,
@@ -753,20 +756,20 @@ pixman_blt (uint32_t *src_bits,
 PIXMAN_EXPORT pixman_bool_t
 pixman_fill (uint32_t *bits,
              int stride,
              int bpp,
              int x,
              int y,
              int width,
              int height,
-             uint32_t xor)
+             uint32_t filler)
 {
     return _pixman_implementation_fill (
-        get_implementation(), bits, stride, bpp, x, y, width, height, xor);
+        get_implementation(), bits, stride, bpp, x, y, width, height, filler);
 }

 static uint32_t
 color_to_uint32 (const pixman_color_t *color)
 {
     return
         (color->alpha >> 8 << 24) |
         (color->red >> 8 << 16) |
@@ -815,17 +818,17 @@ color_to_pixel (const pixman_color_t *co
         c = ((c & 0xff000000) >> 24) | (c << 8);

     if (format == PIXMAN_a1)
         c = c >> 31;
     else if (format == PIXMAN_a8)
         c = c >> 24;
     else if (format == PIXMAN_r5g6b5 ||
              format == PIXMAN_b5g6r5)
-        c = CONVERT_8888_TO_0565 (c);
+        c = convert_8888_to_0565 (c);

#if 0
     printf ("color: %x %x %x %x\n",
             color->alpha, color->red, color->green, color->blue);
     printf ("pixel: %x\n", c);
#endif

     *pixel = c;
     return TRUE;
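The pixman_fill parameter rename from xor to filler changes no behaviour; xor is the alternative spelling of the ^ operator reserved by C++ (and defined as a macro by C's <iso646.h>), so the old name broke consumers compiling in those modes. A usage sketch with invented buffer geometry (note that pixman_fill takes its stride in uint32_t units, not bytes):

#include <pixman.h>
#include <stdint.h>

int
main (void)
{
    static uint32_t buf[64 * 64];

    /* Fill a 16x16 block at (8, 8) of a 64-pixel-wide a8r8g8b8 buffer
     * with opaque red; the last argument is the renamed 'filler' value. */
    pixman_fill (buf, 64, 32, 8, 8, 16, 16, 0xffff0000);

    return 0;
}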
--- a/gfx/cairo/libpixman/src/pixman.h
+++ b/gfx/cairo/libpixman/src/pixman.h
@@ -223,16 +223,19 @@ pixman_bool_t pixman_transform_is_identi
 pixman_bool_t pixman_transform_is_scale (const struct pixman_transform *t);
 pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
 pixman_bool_t pixman_transform_is_inverse (const struct pixman_transform *a,
                                            const struct pixman_transform *b);

 /*
  * Floating point matrices
  */
+typedef struct pixman_f_transform pixman_f_transform_t;
+typedef struct pixman_f_vector pixman_f_vector_t;
+
 struct pixman_f_vector
 {
     double v[3];
 };

 struct pixman_f_transform
 {
     double m[3][3];
@@ -286,17 +289,38 @@ typedef enum
 typedef enum
 {
     PIXMAN_FILTER_FAST,
     PIXMAN_FILTER_GOOD,
     PIXMAN_FILTER_BEST,
     PIXMAN_FILTER_NEAREST,
     PIXMAN_FILTER_BILINEAR,
-    PIXMAN_FILTER_CONVOLUTION
+    PIXMAN_FILTER_CONVOLUTION,
+
+    /* The SEPARABLE_CONVOLUTION filter takes the following parameters:
+     *
+     *     width:        integer given as 16.16 fixpoint number
+     *     height:       integer given as 16.16 fixpoint number
+     *     x_phase_bits: integer given as 16.16 fixpoint
+     *     y_phase_bits: integer given as 16.16 fixpoint
+     *     xtables:      (1 << x_phase_bits) tables of size width
+     *     ytables:      (1 << y_phase_bits) tables of size height
+     *
+     * When sampling at (x, y), the location is first rounded to one of
+     * n_x_phases * n_y_phases subpixel positions. These subpixel positions
+     * determine an xtable and a ytable to use.
+     *
+     * Conceptually a width x height matrix is then formed in which each entry
+     * is the product of the corresponding entries in the x and y tables.
+     * This matrix is then aligned with the image pixels such that its center
+     * is as close as possible to the subpixel location chosen earlier. Then
+     * the image is convolved with the matrix and the resulting pixel returned.
+     */
+    PIXMAN_FILTER_SEPARABLE_CONVOLUTION
 } pixman_filter_t;

 typedef enum
 {
     PIXMAN_OP_CLEAR = 0x00,
     PIXMAN_OP_SRC = 0x01,
     PIXMAN_OP_DST = 0x02,
     PIXMAN_OP_OVER = 0x03,
@@ -804,16 +828,43 @@ void pixman_image_set_accessors (p
 void            pixman_image_set_indexed            (pixman_image_t *image,
                                                      const pixman_indexed_t *indexed);
 uint32_t       *pixman_image_get_data               (pixman_image_t *image);
 int             pixman_image_get_width              (pixman_image_t *image);
 int             pixman_image_get_height             (pixman_image_t *image);
 int             pixman_image_get_stride             (pixman_image_t *image); /* in bytes */
 int             pixman_image_get_depth              (pixman_image_t *image);
 pixman_format_code_t pixman_image_get_format        (pixman_image_t *image);
+
+typedef enum
+{
+    PIXMAN_KERNEL_IMPULSE,
+    PIXMAN_KERNEL_BOX,
+    PIXMAN_KERNEL_LINEAR,
+    PIXMAN_KERNEL_CUBIC,
+    PIXMAN_KERNEL_GAUSSIAN,
+    PIXMAN_KERNEL_LANCZOS2,
+    PIXMAN_KERNEL_LANCZOS3,
+    PIXMAN_KERNEL_LANCZOS3_STRETCHED /* Jim Blinn's 'nice' filter */
+} pixman_kernel_t;
+
+/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
+ * with the given kernels and scale parameters.
+ */
+pixman_fixed_t *
+pixman_filter_create_separable_convolution (int             *n_values,
+                                            pixman_fixed_t   scale_x,
+                                            pixman_fixed_t   scale_y,
+                                            pixman_kernel_t  reconstruct_x,
+                                            pixman_kernel_t  reconstruct_y,
+                                            pixman_kernel_t  sample_x,
+                                            pixman_kernel_t  sample_y,
+                                            int              subsample_bits_x,
+                                            int              subsample_bits_y);
+
 pixman_bool_t   pixman_image_fill_rectangles       (pixman_op_t                   op,
                                                     pixman_image_t               *image,
                                                     const pixman_color_t         *color,
                                                     int                           n_rects,
                                                     const pixman_rectangle16_t   *rects);
 pixman_bool_t   pixman_image_fill_boxes            (pixman_op_t                   op,
                                                     pixman_image_t               *dest,
                                                     const pixman_color_t         *color,
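Taken together, the header additions expose the new filter end to end: choose reconstruction and sampling kernels, let pixman_filter_create_separable_convolution bake them into the width/height/phase-table parameter block described in the enum comment above, and attach that block with pixman_image_set_filter. A hedged sketch of the flow for a 2x downscale (the kernel and subsample choices are illustrative, not prescriptive):

#include <pixman.h>
#include <stdlib.h>

static pixman_bool_t
set_downscale_filter (pixman_image_t *image)
{
    int n_params;
    pixman_fixed_t *params;

    params = pixman_filter_create_separable_convolution (
        &n_params,
        pixman_double_to_fixed (2.0),   /* scale_x: source/dest ratio */
        pixman_double_to_fixed (2.0),   /* scale_y */
        PIXMAN_KERNEL_LINEAR,           /* reconstruct_x */
        PIXMAN_KERNEL_LINEAR,           /* reconstruct_y */
        PIXMAN_KERNEL_BOX,              /* sample_x */
        PIXMAN_KERNEL_BOX,              /* sample_y */
        4, 4);                          /* subsample_bits_x/y */

    if (!params)
        return FALSE;

    pixman_image_set_filter (image,
                             PIXMAN_FILTER_SEPARABLE_CONVOLUTION,
                             params, n_params);
    free (params);
    return TRUE;
}

Four subsample bits per axis give 16 phases, matching the (1 << x_phase_bits) table count in the header comment; the returned parameter array is heap-allocated and is freed by the caller once the filter has been set.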