Bug 1689998 - Update pixman sources to release 0.40.0. r=jrmuizel
☠☠ backed out by 444011347958 ☠ ☠
author Jonathan Kew <jkew@mozilla.com>
Tue, 02 Feb 2021 14:19:18 +0000
changeset 565599 07da9c6f39dde2f09e37c786a38f4c64081ba757
parent 565598 06c045f7240a7f168e904e1e278e08cf1d3ed928
child 565600 5e707628cd5d60f6341840542a9b5f2eac9c7aa1
push id 38164
push user malexandru@mozilla.com
push date Tue, 02 Feb 2021 21:48:09 +0000
treeherder mozilla-central@5ff587e0026f [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 1689998
milestone 87.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1689998 - Update pixman sources to release 0.40.0. r=jrmuizel Differential Revision: https://phabricator.services.mozilla.com/D103684
gfx/cairo/libpixman/src/dither/blue-noise-64x64.h
gfx/cairo/libpixman/src/dither/make-blue-noise.c
gfx/cairo/libpixman/src/loongson-mmintrin.h
gfx/cairo/libpixman/src/make-combine.pl
gfx/cairo/libpixman/src/meson.build
gfx/cairo/libpixman/src/pixman-access.c
gfx/cairo/libpixman/src/pixman-arm-asm.h
gfx/cairo/libpixman/src/pixman-arm-common.h
gfx/cairo/libpixman/src/pixman-arm-detect-win32.asm
gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
gfx/cairo/libpixman/src/pixman-arm-neon.c
gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
gfx/cairo/libpixman/src/pixman-arm-simd.c
gfx/cairo/libpixman/src/pixman-arm.c
gfx/cairo/libpixman/src/pixman-bits-image.c
gfx/cairo/libpixman/src/pixman-combine-float.c
gfx/cairo/libpixman/src/pixman-combine.c.template
gfx/cairo/libpixman/src/pixman-combine.h.template
gfx/cairo/libpixman/src/pixman-combine16.c
gfx/cairo/libpixman/src/pixman-combine32.c
gfx/cairo/libpixman/src/pixman-combine32.h
gfx/cairo/libpixman/src/pixman-combine64.c
gfx/cairo/libpixman/src/pixman-combine64.h
gfx/cairo/libpixman/src/pixman-compiler.h
gfx/cairo/libpixman/src/pixman-conical-gradient.c
gfx/cairo/libpixman/src/pixman-cpu.c
gfx/cairo/libpixman/src/pixman-dither.h
gfx/cairo/libpixman/src/pixman-edge-imp.h
gfx/cairo/libpixman/src/pixman-fast-path.c
gfx/cairo/libpixman/src/pixman-fast-path.h
gfx/cairo/libpixman/src/pixman-filter.c
gfx/cairo/libpixman/src/pixman-general.c
gfx/cairo/libpixman/src/pixman-glyph.c
gfx/cairo/libpixman/src/pixman-gradient-walker.c
gfx/cairo/libpixman/src/pixman-image.c
gfx/cairo/libpixman/src/pixman-implementation.c
gfx/cairo/libpixman/src/pixman-inlines.h
gfx/cairo/libpixman/src/pixman-linear-gradient.c
gfx/cairo/libpixman/src/pixman-matrix.c
gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.S
gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.h
gfx/cairo/libpixman/src/pixman-mips-dspr2.c
gfx/cairo/libpixman/src/pixman-mips-dspr2.h
gfx/cairo/libpixman/src/pixman-mmx.c
gfx/cairo/libpixman/src/pixman-noop.c
gfx/cairo/libpixman/src/pixman-private.h
gfx/cairo/libpixman/src/pixman-radial-gradient.c
gfx/cairo/libpixman/src/pixman-region.c
gfx/cairo/libpixman/src/pixman-solid-fill.c
gfx/cairo/libpixman/src/pixman-sse2.c
gfx/cairo/libpixman/src/pixman-ssse3.c
gfx/cairo/libpixman/src/pixman-utils.c
gfx/cairo/libpixman/src/pixman-version.h
gfx/cairo/libpixman/src/pixman-version.h.in
gfx/cairo/libpixman/src/pixman-vmx.c
gfx/cairo/libpixman/src/pixman-x64-mmx-emulation.h
gfx/cairo/libpixman/src/pixman-x86.c
gfx/cairo/libpixman/src/pixman.c
gfx/cairo/libpixman/src/pixman.h
gfx/cairo/libpixman/src/refactor
gfx/cairo/libpixman/src/solaris-hwcap.mapfile
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/dither/blue-noise-64x64.h
@@ -0,0 +1,77 @@
+/* WARNING: This file is generated by make-blue-noise.c
+ * Please edit that file instead of this one.
+ */
+
+#ifndef BLUE_NOISE_64X64_H
+#define BLUE_NOISE_64X64_H
+
+#include <stdint.h>
+
+static const uint16_t dither_blue_noise_64x64[4096] = {
+    3039, 1368, 3169, 103, 2211, 1248, 2981, 668, 2633, 37, 3963, 2903, 384, 2564, 3115, 1973, 3348, 830, 2505, 1293, 3054, 1060, 1505, 3268, 400, 1341, 593, 3802, 3384, 429, 4082, 1411, 2503, 3863, 126, 1292, 1887, 2855, 205, 2094, 2977, 1899, 3924, 356, 3088, 2500, 3942, 1409, 2293, 1734, 3732, 1291, 3227, 277, 2054, 786, 2871, 411, 2425, 1678, 3986, 455, 2879, 2288,
+    388, 1972, 3851, 778, 2768, 3697, 944, 2123, 1501, 3533, 937, 1713, 1381, 3888, 156, 1242, 516, 2888, 1607, 3676, 632, 2397, 3804, 2673, 1898, 3534, 2593, 1777, 1170, 2299, 3013, 1838, 523, 3053, 1647, 3601, 3197, 959, 1520, 3633, 893, 2437, 3367, 2187, 1258, 137, 1965, 401, 3546, 643, 3087, 2498, 733, 2786, 3371, 4053, 1266, 1977, 3663, 183, 2570, 2107, 1183, 3708,
+    907, 2473, 1151, 3363, 1527, 1902, 232, 3903, 3060, 496, 2486, 3206, 2165, 861, 2387, 3653, 2101, 3972, 132, 2162, 3437, 1827, 215, 895, 3114, 271, 969, 2932, 197, 1598, 878, 3696, 1140, 2120, 904, 2431, 302, 3846, 2675, 481, 3187, 66, 1440, 650, 3833, 2826, 3435, 901, 2936, 2111, 250, 1875, 3609, 1174, 1747, 162, 2346, 3420, 913, 3172, 1383, 752, 3298, 1735,
+    3540, 2938, 249, 2324, 526, 3099, 2561, 1324, 2347, 1861, 1200, 3702, 257, 3442, 1514, 2999, 992, 1766, 2735, 1163, 478, 2943, 1279, 3635, 2177, 1464, 3672, 2386, 3871, 3340, 2690, 64, 3489, 2811, 3999, 633, 1948, 1243, 2269, 1807, 1143, 2750, 3729, 1790, 2363, 1053, 1537, 2636, 4065, 1076, 1476, 3869, 450, 2200, 2676, 658, 2979, 1548, 544, 1913, 2838, 3911, 116, 2698,
+    517, 1295, 3997, 1739, 3665, 1083, 3509, 599, 3400, 118, 2956, 720, 2689, 1907, 567, 2523, 284, 3397, 711, 3219, 2450, 3985, 1665, 2549, 562, 3011, 1855, 729, 1355, 528, 1908, 2456, 1384, 337, 1540, 2654, 3138, 3513, 703, 4080, 3314, 2047, 855, 3037, 209, 3317, 577, 1828, 17, 2336, 3193, 2748, 962, 3441, 1450, 3246, 1075, 3878, 2615, 3497, 1033, 2310, 1442, 2183,
+    1654, 3254, 2061, 738, 2832, 148, 2030, 1670, 909, 3850, 2109, 1533, 4046, 1085, 3098, 3897, 1378, 2248, 3829, 1495, 1966, 23, 797, 3427, 1124, 4057, 95, 2787, 2190, 3074, 3950, 742, 3194, 1999, 3386, 1113, 16, 1657, 2804, 201, 1543, 383, 2559, 1325, 3604, 2068, 2493, 3771, 1284, 3460, 710, 1716, 2447, 80, 3811, 2032, 347, 2227, 15, 1689, 397, 3084, 662, 3798,
+    973, 43, 2608, 3143, 1459, 2423, 4066, 2770, 3191, 1283, 2630, 314, 3235, 2289, 72, 1822, 2840, 924, 350, 2653, 1057, 3715, 2235, 2775, 346, 2083, 1553, 3292, 1081, 274, 1686, 1188, 2327, 3743, 578, 2234, 3916, 2519, 1011, 3056, 2207, 3438, 3890, 537, 1617, 837, 3094, 373, 2795, 1980, 276, 3951, 1353, 3015, 844, 1724, 3651, 2923, 1316, 4092, 2504, 3627, 1936, 2854,
+    2461, 3929, 1193, 421, 3746, 820, 1180, 286, 2261, 532, 3625, 1812, 802, 1327, 3527, 670, 3730, 2025, 3124, 3565, 529, 2960, 1769, 1390, 3196, 2494, 3756, 796, 3618, 2602, 3463, 2847, 166, 953, 1745, 2900, 438, 2070, 1418, 3741, 639, 1205, 1891, 2882, 2282, 4012, 1182, 1696, 3630, 951, 2904, 2170, 3530, 375, 2320, 2742, 1132, 701, 3216, 2023, 847, 1230, 310, 3431,
+    770, 1961, 3531, 1702, 2181, 3370, 1877, 3072, 1571, 3389, 1071, 2415, 3782, 2803, 1610, 2454, 1211, 182, 1655, 2322, 1282, 3372, 287, 3935, 704, 1232, 415, 1910, 2286, 1399, 556, 1964, 4068, 2444, 3605, 1272, 3345, 816, 3526, 256, 2402, 2777, 955, 345, 3289, 111, 2727, 635, 2396, 1488, 3331, 600, 1032, 1575, 4026, 515, 3507, 2433, 1605, 460, 3364, 2783, 1810, 1397,
+    2334, 223, 2945, 688, 2533, 99, 2705, 624, 3944, 2073, 46, 2978, 508, 2132, 269, 3173, 3453, 2631, 4076, 694, 1892, 2586, 972, 2178, 3470, 1695, 2849, 3141, 77, 3884, 994, 3029, 1536, 673, 3083, 124, 2583, 1722, 2821, 1944, 4027, 1661, 3176, 3728, 1337, 1813, 3503, 2035, 3930, 157, 2537, 1865, 3096, 2646, 1941, 3252, 1449, 135, 2836, 3758, 2139, 84, 3678, 3106,
+    3862, 1545, 3307, 1320, 3955, 1031, 3664, 1306, 2460, 776, 1487, 3294, 1187, 3990, 1903, 1021, 549, 1484, 943, 3027, 97, 3853, 1499, 2880, 198, 2575, 3995, 1089, 1587, 2475, 3282, 339, 2657, 1158, 2105, 1493, 3943, 580, 3232, 1287, 846, 48, 2480, 2112, 771, 2534, 459, 3134, 850, 1298, 3790, 325, 3652, 1249, 193, 940, 2202, 3895, 1829, 911, 1366, 2577, 1069, 534,
+    2104, 1009, 2667, 392, 1983, 2917, 1645, 324, 3439, 2869, 3705, 1767, 2592, 756, 2916, 3683, 2276, 2850, 2053, 3594, 2403, 3181, 634, 3699, 1933, 906, 519, 2150, 3673, 764, 1770, 2220, 3795, 3336, 502, 3547, 2339, 1110, 301, 2210, 3354, 3643, 569, 1518, 2940, 3973, 1138, 1613, 2773, 2127, 2983, 1671, 769, 2161, 3800, 2730, 3127, 1179, 533, 3259, 2284, 4014, 1651, 2820,
+    3566, 653, 1839, 3455, 2399, 789, 3149, 2244, 1863, 1099, 474, 2307, 158, 3541, 1312, 1711, 0, 3902, 360, 1629, 1091, 395, 1781, 1191, 2374, 3353, 1419, 3225, 206, 2931, 3553, 1046, 54, 1646, 2470, 910, 1860, 3137, 3770, 2635, 1562, 2809, 1215, 3788, 222, 2199, 3335, 67, 3606, 524, 1001, 3309, 2410, 3473, 591, 1619, 291, 2502, 3629, 2891, 335, 741, 3378, 168,
+    2384, 3129, 4051, 22, 1444, 3613, 543, 3893, 186, 2665, 4062, 933, 3058, 2142, 449, 2711, 3224, 849, 1330, 3349, 2195, 2670, 3484, 2993, 32, 3774, 2722, 1859, 2548, 1268, 583, 2027, 3165, 2807, 4029, 227, 2897, 1434, 721, 1816, 195, 905, 2066, 3258, 1754, 970, 2674, 1880, 2338, 3915, 1485, 2660, 14, 1313, 2914, 2046, 4074, 791, 1917, 1301, 1725, 2687, 2019, 1443,
+    418, 1186, 1664, 2859, 1049, 2056, 2741, 1226, 1589, 3186, 2042, 1377, 3449, 1574, 3941, 1063, 1930, 2501, 3751, 2930, 671, 4031, 888, 2081, 1544, 684, 1117, 351, 4052, 1698, 2393, 3881, 1439, 785, 1277, 2013, 3488, 441, 2459, 3980, 3061, 3481, 2543, 419, 3020, 609, 3515, 1350, 799, 2878, 348, 2034, 3966, 1824, 950, 3281, 1394, 2239, 3452, 55, 3922, 3119, 892, 3785,
+    3023, 2140, 782, 2492, 3817, 241, 3355, 2424, 856, 3639, 612, 2556, 245, 2858, 705, 2316, 3562, 495, 1748, 128, 1912, 1454, 280, 2552, 3905, 3130, 2274, 3472, 834, 3055, 240, 2692, 471, 2272, 3301, 2632, 1080, 3693, 2136, 1029, 1364, 590, 1611, 4067, 1190, 2360, 3827, 261, 3180, 1768, 3471, 1103, 3003, 520, 3674, 151, 2571, 555, 3033, 982, 2353, 504, 1259, 2555,
+    149, 3889, 3380, 493, 3178, 1681, 663, 1924, 2990, 49, 1792, 3861, 1192, 1987, 3273, 297, 1457, 3043, 1177, 2292, 3249, 2829, 3682, 1154, 1758, 428, 2872, 1993, 1500, 3703, 1129, 3421, 1840, 3754, 163, 659, 1733, 3182, 38, 2875, 1957, 3614, 2237, 78, 1873, 2801, 1513, 2121, 1074, 2516, 667, 3710, 1429, 2430, 2088, 2830, 1072, 3557, 1531, 2733, 1955, 3286, 3590, 1826,
+    2778, 1068, 1932, 1452, 2279, 1185, 3564, 3952, 1391, 2726, 3313, 2331, 870, 3709, 1674, 2772, 4085, 808, 2596, 3848, 927, 538, 2335, 3334, 773, 3597, 1347, 109, 2663, 608, 2108, 2994, 936, 1524, 2922, 3968, 2422, 1467, 845, 3870, 321, 2704, 1073, 3308, 3680, 823, 430, 3375, 4030, 112, 2171, 2695, 267, 3374, 731, 1627, 3919, 1871, 352, 3839, 1370, 234, 794, 1532,
+    3245, 647, 3575, 74, 3045, 2766, 285, 2174, 498, 1059, 1551, 385, 3125, 2598, 143, 1128, 2095, 3395, 318, 1590, 3524, 1345, 1969, 242, 2759, 2092, 947, 3926, 3244, 2356, 1658, 6, 3593, 2554, 1172, 1995, 371, 2755, 3417, 2294, 1570, 3164, 748, 2517, 1401, 3111, 2420, 1662, 2910, 1276, 3276, 854, 1804, 4000, 1253, 2987, 229, 2344, 3184, 649, 2196, 2921, 4095, 2389,
+    1289, 2193, 2579, 4023, 757, 1858, 986, 3199, 2514, 3475, 4021, 2154, 651, 1432, 3468, 2404, 574, 1799, 3105, 2145, 86, 2614, 3218, 1565, 4088, 2481, 3079, 1815, 323, 1212, 3837, 759, 2159, 435, 3223, 784, 3659, 1114, 1888, 550, 1221, 3786, 1803, 499, 2117, 185, 3763, 942, 589, 2001, 3838, 1483, 3154, 2256, 468, 2544, 3403, 898, 1208, 2610, 3622, 967, 1929, 378,
+    3781, 220, 1656, 1115, 3347, 2428, 3822, 1577, 712, 1959, 110, 2765, 1762, 3854, 979, 2928, 3714, 1371, 746, 3969, 2884, 975, 3779, 641, 1142, 159, 1460, 702, 3485, 2866, 2495, 3330, 1305, 3937, 1635, 2229, 2962, 146, 4055, 3091, 2417, 100, 3508, 2933, 4006, 1167, 1920, 2760, 3552, 2545, 433, 2845, 142, 1056, 1886, 3616, 1435, 2099, 3803, 1749, 27, 1446, 3350, 2843,
+    884, 3310, 2948, 2103, 447, 1351, 187, 2895, 3655, 1256, 3036, 932, 3325, 2257, 451, 1915, 40, 2780, 2438, 1112, 1814, 423, 2290, 1905, 2898, 3419, 2306, 3760, 1938, 486, 1019, 1791, 3010, 2628, 203, 3408, 1269, 2507, 1606, 862, 2779, 2078, 952, 1529, 2638, 708, 3332, 1413, 2, 1726, 1156, 3500, 2392, 3791, 3076, 812, 107, 2861, 501, 3050, 3487, 2455, 594, 1731,
+    2685, 1498, 680, 3908, 2621, 3529, 1786, 2236, 342, 2569, 1526, 3722, 230, 1290, 3203, 3947, 1609, 3516, 467, 3267, 3685, 1461, 3140, 3569, 367, 1759, 928, 2754, 1332, 2219, 4034, 260, 655, 1984, 978, 3814, 617, 2086, 3525, 279, 3841, 1373, 3361, 319, 2251, 3066, 407, 2382, 3918, 3133, 2168, 762, 1523, 507, 2641, 1677, 4025, 2413, 1584, 793, 2049, 1109, 3962, 2218,
+    1194, 3692, 266, 1687, 981, 3103, 740, 3983, 1005, 3434, 570, 2383, 1942, 2718, 676, 2462, 1007, 2089, 1308, 2222, 233, 2568, 829, 1241, 2669, 3987, 514, 3303, 69, 3142, 1603, 3560, 2295, 3288, 1497, 2696, 1764, 2865, 1058, 3271, 1914, 477, 2529, 3927, 1736, 1273, 3752, 2029, 1012, 565, 2798, 4078, 1949, 3305, 1175, 2179, 380, 3366, 1195, 3849, 2637, 416, 2959, 125,
+    3396, 2467, 2036, 3234, 2340, 68, 2819, 1436, 2011, 3139, 1704, 4073, 860, 3582, 1468, 2969, 211, 3157, 4056, 866, 2935, 2000, 3923, 31, 2157, 1477, 2429, 1147, 3792, 2557, 774, 2802, 1153, 3747, 464, 3192, 42, 3904, 539, 1474, 2283, 803, 2876, 1061, 75, 3477, 747, 2893, 1538, 3626, 251, 1322, 2506, 189, 2791, 3667, 939, 2991, 1971, 175, 3195, 1416, 3648, 1857,
+    3052, 454, 851, 3789, 1271, 1906, 3694, 2484, 406, 2757, 26, 1189, 2909, 296, 2215, 3784, 1864, 637, 2715, 1673, 3445, 581, 1572, 3059, 3469, 761, 2984, 1737, 2058, 440, 1414, 1921, 121, 2527, 894, 2223, 1302, 2377, 3077, 2666, 3759, 3198, 1811, 3661, 2166, 2731, 1883, 359, 3285, 2458, 1805, 3459, 926, 3834, 675, 1893, 1496, 2612, 657, 3523, 1763, 2354, 564, 961,
+    1367, 3977, 1588, 2714, 322, 3446, 1088, 625, 3887, 1354, 3535, 2090, 3316, 1760, 1127, 483, 3491, 1421, 2301, 94, 1202, 3740, 2311, 1014, 1878, 3836, 180, 3412, 991, 2868, 3953, 3450, 3081, 1632, 4071, 1882, 3543, 726, 1719, 179, 1171, 364, 1420, 622, 3090, 1490, 946, 4007, 2212, 1102, 619, 2739, 2189, 1669, 2937, 3426, 39, 3940, 2191, 1264, 887, 4091, 2792, 2135,
+    4, 2883, 2281, 631, 3044, 1641, 2232, 3243, 1773, 2319, 827, 2591, 629, 3938, 2426, 3222, 2629, 1044, 3879, 3293, 1952, 2749, 275, 2590, 472, 1372, 2496, 660, 3669, 2264, 208, 915, 2167, 561, 2828, 307, 3265, 1104, 3964, 2155, 3425, 1951, 4077, 2391, 283, 3387, 2581, 115, 1415, 3069, 3896, 141, 3158, 1214, 442, 2405, 1349, 3085, 425, 2528, 3002, 312, 1602, 3588,
+    1137, 3323, 1963, 1002, 3578, 2521, 127, 925, 2970, 273, 3737, 1573, 167, 2863, 1509, 800, 147, 2059, 2942, 409, 921, 3151, 1451, 3909, 3333, 2844, 2096, 1512, 3136, 1210, 1798, 2709, 1331, 3586, 1034, 1521, 2441, 2926, 488, 2585, 775, 3031, 2693, 879, 3602, 1173, 2028, 3654, 2781, 841, 1975, 1507, 3646, 768, 3991, 2012, 996, 3544, 1666, 3810, 1990, 3360, 753, 2597,
+    3736, 304, 1473, 3828, 485, 1334, 4008, 2072, 3495, 1136, 2806, 2004, 3236, 1010, 2130, 3819, 1750, 3567, 644, 2515, 1794, 3636, 698, 2137, 1162, 832, 3761, 326, 2613, 513, 3302, 3820, 357, 3163, 2259, 3733, 101, 1922, 1386, 3587, 1640, 28, 1286, 2141, 1761, 2918, 693, 1639, 457, 3250, 2434, 365, 2599, 1729, 3284, 2643, 306, 2793, 689, 1090, 104, 1309, 2305, 1831,
+    2776, 859, 2446, 2915, 1778, 3337, 2677, 614, 1508, 2409, 469, 4033, 1321, 3563, 402, 3131, 2720, 1093, 1569, 4042, 1229, 2277, 216, 3046, 1817, 57, 3006, 1684, 4059, 2016, 795, 2440, 1652, 1960, 610, 2763, 920, 3864, 3110, 1026, 2326, 3762, 3233, 521, 3856, 173, 2457, 3939, 2138, 1262, 3572, 989, 3021, 2238, 119, 1445, 3832, 1809, 2297, 3467, 2700, 3684, 3102, 394,
+    4036, 2050, 3256, 89, 2198, 1079, 248, 1845, 3805, 3104, 880, 1779, 2688, 717, 2373, 1375, 262, 2249, 3071, 13, 2813, 3429, 1600, 3984, 2416, 3603, 1299, 2298, 998, 3492, 1393, 2951, 10, 4009, 1247, 3462, 1679, 2204, 414, 2736, 316, 1894, 2816, 1050, 3373, 1462, 3107, 817, 3464, 21, 1835, 4070, 568, 1178, 3718, 875, 3168, 466, 2974, 1458, 2084, 616, 1564, 1018,
+    1693, 546, 1244, 3899, 716, 3160, 3608, 2877, 1220, 334, 3443, 2270, 44, 3000, 1843, 3928, 3405, 766, 3686, 2040, 587, 993, 2647, 387, 930, 2753, 630, 3274, 150, 2808, 453, 3638, 1092, 2352, 3030, 239, 2562, 700, 3240, 1257, 4016, 730, 1515, 2203, 2551, 417, 1866, 1123, 2348, 2902, 1550, 2678, 2075, 3238, 1630, 2531, 2115, 1255, 4054, 840, 290, 3874, 2477, 3399,
+    2250, 3577, 2817, 1626, 2576, 1356, 2315, 792, 2087, 2618, 1612, 3855, 1263, 3637, 1036, 494, 1535, 2553, 1198, 1715, 3867, 3170, 1359, 1954, 3483, 1539, 2069, 3886, 1772, 2487, 1534, 2045, 3242, 806, 1578, 2018, 3948, 1423, 3596, 2076, 2466, 3424, 139, 3688, 871, 4049, 2852, 3342, 547, 3719, 327, 852, 3505, 207, 2794, 542, 3600, 45, 2411, 3324, 1788, 3012, 1235, 61,
+    2655, 917, 253, 1986, 3738, 313, 1706, 4072, 120, 3229, 957, 597, 2024, 3262, 2453, 2857, 2002, 3190, 210, 2784, 2206, 300, 2400, 3766, 553, 3152, 218, 1150, 2988, 883, 3753, 627, 2664, 3831, 437, 3385, 1008, 2957, 60, 1636, 891, 2899, 1776, 3062, 1315, 2026, 194, 1643, 2079, 1296, 3201, 2465, 1379, 1927, 3898, 1125, 1847, 2846, 1552, 1028, 2725, 2169, 787, 3202,
+    1441, 3982, 3032, 1052, 3251, 605, 2639, 3073, 1431, 3642, 2329, 2949, 341, 1634, 833, 129, 4020, 916, 3571, 669, 1506, 3411, 821, 2856, 1207, 2337, 2683, 3448, 340, 2214, 3128, 235, 1738, 1288, 2833, 2419, 606, 1884, 2668, 552, 3765, 1176, 399, 2302, 596, 3591, 2634, 767, 3845, 2767, 995, 3967, 491, 3057, 814, 2300, 3422, 691, 3797, 254, 3645, 509, 3478, 1836,
+    2119, 475, 2445, 1525, 2175, 3539, 914, 1926, 473, 1157, 1800, 3971, 2701, 3739, 2129, 3486, 1333, 1784, 2366, 2982, 1070, 4089, 1802, 73, 1642, 3958, 835, 1837, 1480, 4043, 1217, 2469, 3416, 2113, 88, 3668, 1240, 3255, 3920, 2355, 3167, 2003, 2645, 3936, 3228, 1592, 1144, 3474, 2394, 79, 1820, 2241, 1594, 3656, 2584, 153, 1448, 3034, 2005, 2511, 1692, 1335, 3913, 217,
+    2822, 3391, 745, 3813, 192, 1274, 2941, 3847, 2489, 3440, 744, 161, 1422, 1086, 572, 3004, 2617, 338, 3807, 2031, 236, 2472, 3065, 2098, 3358, 362, 2163, 3574, 497, 2788, 1970, 948, 3885, 685, 3100, 1712, 2228, 292, 1408, 1016, 164, 3537, 1417, 941, 34, 2172, 3001, 358, 1491, 3147, 699, 3356, 258, 1149, 2946, 1787, 3931, 382, 1146, 3291, 818, 2890, 2379, 1096,
+    3679, 1328, 1901, 3162, 2747, 1730, 2253, 5, 1556, 2818, 2093, 3166, 2522, 3410, 2287, 1701, 956, 3237, 620, 1596, 3300, 1307, 511, 3701, 1020, 2939, 1362, 2532, 3208, 749, 3641, 160, 1522, 2624, 1095, 4086, 826, 2841, 3583, 2173, 1727, 723, 2925, 1911, 2482, 3726, 863, 1962, 4028, 1111, 2835, 3773, 2449, 2022, 582, 3278, 923, 2619, 2152, 4039, 92, 1934, 3145, 677,
+    2530, 53, 2303, 1003, 458, 3989, 739, 3321, 1064, 369, 3556, 877, 1900, 426, 3876, 1, 3617, 2106, 1197, 2805, 3634, 857, 2706, 1504, 2418, 682, 3868, 20, 1139, 1688, 2333, 3311, 2907, 1945, 265, 2385, 3433, 1601, 636, 2620, 3095, 4044, 386, 3382, 1184, 527, 2814, 3414, 2342, 465, 1889, 1343, 874, 3479, 1502, 2233, 3689, 1385, 559, 2745, 1463, 3465, 376, 1718,
+    3217, 4045, 1580, 3612, 2525, 1228, 3018, 1958, 3725, 2358, 1361, 3996, 1581, 3063, 1224, 2737, 1475, 2442, 3946, 191, 1796, 2128, 3975, 134, 1916, 3318, 1597, 2071, 3749, 2672, 403, 1278, 602, 3745, 3220, 1374, 445, 2064, 3830, 243, 1252, 2390, 1563, 2724, 3875, 1818, 1346, 165, 1650, 3264, 2680, 117, 2998, 4081, 343, 2799, 9, 3122, 1743, 3724, 1040, 2231, 3842, 1209,
+    900, 398, 2851, 697, 1797, 3482, 293, 2679, 1649, 566, 2954, 91, 2697, 714, 2060, 3211, 781, 480, 3040, 1038, 2611, 666, 2989, 3458, 1201, 2796, 548, 2975, 839, 3121, 1850, 4001, 2208, 1631, 790, 2558, 2972, 1148, 3213, 1849, 3624, 971, 2102, 108, 772, 3101, 2589, 3777, 1042, 656, 3907, 2097, 1615, 2540, 805, 1935, 1231, 3494, 2451, 268, 2995, 750, 2682, 2020,
+    3024, 1392, 2124, 3279, 106, 2217, 1387, 822, 3214, 3825, 2160, 1000, 2395, 3691, 228, 4038, 1872, 3413, 1608, 2225, 3536, 303, 1653, 886, 2541, 224, 4037, 2252, 1428, 172, 3504, 958, 2848, 113, 3628, 1834, 3979, 19, 2317, 779, 2797, 518, 3174, 3549, 1482, 2266, 444, 2014, 3555, 2439, 1213, 3113, 535, 1135, 3204, 3858, 2309, 931, 623, 2009, 3359, 1566, 140, 3550,
+    1808, 3872, 2488, 1152, 3764, 2892, 3960, 2412, 353, 1223, 1825, 3444, 3116, 1717, 1082, 2313, 1280, 2661, 82, 3852, 1389, 3200, 2330, 3812, 2038, 3581, 1728, 1039, 3339, 2427, 586, 2580, 1238, 3328, 2280, 1047, 595, 2662, 1363, 3338, 1620, 3934, 2497, 1881, 1054, 3954, 3215, 864, 2887, 1801, 320, 3519, 2378, 3704, 1753, 424, 2958, 1660, 4005, 2601, 1116, 3912, 2381, 573,
+    2740, 200, 828, 1667, 432, 1931, 1035, 1616, 3598, 2640, 728, 264, 1437, 557, 3501, 2966, 372, 3734, 974, 1978, 758, 2719, 1145, 452, 1433, 725, 2681, 408, 3843, 1918, 1547, 3906, 1996, 503, 1456, 3019, 3493, 1700, 3742, 355, 2134, 176, 1311, 615, 2867, 315, 1680, 1314, 8, 3297, 1494, 783, 1950, 83, 2656, 1382, 3561, 138, 2834, 1404, 330, 1904, 3156, 1027,
+    1357, 3381, 3041, 3666, 2729, 734, 3415, 177, 3051, 2021, 4079, 2823, 3775, 2186, 2616, 869, 1668, 3148, 2367, 3315, 393, 4075, 1870, 2920, 3343, 2362, 3188, 1303, 2782, 825, 3171, 259, 2905, 3717, 2538, 184, 2074, 838, 2860, 2407, 1024, 3496, 3008, 3706, 1985, 2349, 3623, 2582, 4058, 2184, 2694, 3873, 2964, 990, 3346, 690, 2033, 1066, 2201, 3490, 2971, 718, 3700, 2188,
+    4061, 391, 1989, 2325, 1430, 3150, 2125, 2526, 592, 1403, 976, 2351, 1165, 1851, 114, 3921, 2063, 613, 1358, 2785, 1623, 2254, 25, 3542, 1045, 246, 1852, 3554, 87, 2243, 3615, 1169, 727, 1705, 968, 3957, 3185, 1251, 500, 4063, 1751, 2622, 842, 1519, 90, 3393, 819, 490, 1874, 999, 571, 1275, 2271, 1586, 4040, 2448, 3126, 3731, 436, 885, 1708, 2421, 24, 1599,
+    889, 2563, 1199, 645, 70, 4013, 1237, 3723, 1694, 3499, 3, 3266, 484, 2997, 3390, 1233, 2842, 3687, 152, 3480, 1084, 3698, 881, 2490, 1542, 3992, 2209, 692, 1690, 3022, 1470, 2625, 2114, 3512, 2359, 381, 2684, 1897, 3368, 1395, 3080, 289, 2065, 3981, 2758, 1141, 3097, 1472, 2870, 3352, 3707, 225, 3159, 505, 1895, 214, 1222, 1774, 2686, 3978, 3275, 1196, 3518, 2825,
+    3270, 1720, 3796, 3466, 2650, 1841, 298, 899, 2862, 2091, 2671, 1744, 3735, 801, 1560, 349, 2262, 903, 1833, 2524, 512, 3117, 1793, 2827, 476, 3038, 1216, 2550, 3826, 980, 431, 4048, 35, 2992, 1265, 1595, 765, 3675, 76, 2247, 696, 3456, 1254, 2452, 664, 1757, 2133, 3750, 145, 2332, 1554, 1981, 3580, 2712, 868, 3640, 2919, 638, 2275, 1427, 309, 2595, 2006, 492,
+    2226, 178, 2911, 836, 1528, 3028, 2240, 3327, 404, 3970, 707, 1294, 2464, 2131, 4032, 2600, 3319, 1406, 2913, 3974, 2156, 1425, 221, 3877, 2017, 811, 3662, 272, 3287, 1988, 2408, 3357, 1746, 598, 3239, 3823, 2182, 2934, 1078, 2604, 3840, 1697, 2906, 413, 3210, 3880, 331, 2644, 1260, 848, 3042, 2535, 1077, 1438, 3261, 2365, 1561, 3799, 85, 3082, 1876, 674, 3932, 1101,
+    3644, 1344, 1943, 2401, 390, 3835, 1048, 2572, 1541, 1133, 3075, 3584, 308, 2889, 1065, 1869, 601, 3783, 282, 1181, 736, 3312, 2368, 1126, 3383, 1675, 2734, 1426, 628, 2873, 1317, 843, 2717, 2048, 1004, 2536, 333, 1782, 3295, 1517, 219, 2153, 815, 3502, 1579, 2268, 987, 3409, 1780, 4018, 354, 665, 3914, 47, 1956, 456, 1006, 2010, 3406, 1130, 3621, 2894, 1549, 3092,
+    2485, 640, 3993, 3179, 1270, 3436, 585, 1925, 3757, 2304, 136, 1976, 1486, 646, 3520, 50, 3155, 1637, 2435, 3522, 1937, 2756, 3748, 661, 2224, 58, 3230, 2357, 1830, 3892, 170, 3607, 1447, 3949, 190, 3392, 1336, 584, 4010, 918, 3016, 3670, 1155, 2406, 52, 1304, 3009, 607, 2085, 2699, 3205, 1848, 2291, 3402, 2764, 3865, 3048, 2508, 735, 2710, 443, 2341, 897, 263,
+    1785, 2769, 983, 56, 2197, 1685, 2703, 202, 2944, 810, 3377, 2626, 3787, 3047, 2055, 1236, 2752, 2122, 945, 3093, 96, 1624, 439, 3014, 1388, 4015, 977, 448, 3506, 1098, 2242, 3026, 506, 2361, 2952, 1862, 3619, 2790, 1992, 2483, 525, 1868, 2652, 4093, 1998, 3595, 2478, 3816, 122, 1412, 929, 3716, 1166, 1648, 813, 1300, 199, 1489, 3998, 1771, 1310, 3808, 2052, 3423,
+    434, 3712, 1625, 3558, 2955, 853, 4019, 1348, 3511, 1732, 1246, 487, 934, 1672, 2510, 3965, 788, 3711, 396, 1369, 4090, 1055, 2603, 1879, 3528, 2518, 2067, 3005, 1516, 2588, 751, 1740, 3418, 1131, 1576, 686, 2296, 1118, 18, 3263, 1365, 3401, 294, 737, 3177, 410, 867, 1633, 2963, 3579, 2375, 252, 2881, 479, 2471, 3576, 2180, 3306, 332, 2255, 3035, 41, 2648, 1396,
+    2929, 2230, 1219, 2512, 446, 2008, 3189, 2388, 626, 2164, 2831, 4047, 2376, 174, 3272, 368, 1469, 3226, 2578, 1991, 2874, 2263, 3681, 876, 188, 1239, 683, 3776, 226, 3183, 4083, 2148, 63, 2649, 3859, 299, 3086, 3933, 1585, 2185, 3767, 988, 1707, 2908, 1407, 1844, 2771, 2245, 1161, 560, 1755, 3376, 2051, 4064, 3135, 1832, 652, 2853, 1051, 3649, 760, 3290, 1105, 3945,
+    872, 154, 3207, 713, 3780, 1453, 281, 1087, 3695, 30, 3299, 1919, 1400, 3551, 1119, 1890, 2314, 618, 1703, 3428, 724, 295, 3146, 1557, 3341, 2896, 1683, 2723, 1974, 1017, 541, 1380, 3720, 804, 3280, 2082, 997, 2567, 777, 2961, 213, 2707, 2328, 3632, 1025, 3891, 3304, 255, 4003, 3108, 2587, 1323, 743, 1479, 105, 1013, 3901, 1618, 2044, 2627, 1465, 1846, 576, 1994,
+    2560, 3521, 1742, 2118, 2800, 3404, 1783, 2609, 2968, 1582, 1022, 412, 2713, 687, 2976, 3857, 2761, 3620, 62, 1108, 3844, 1340, 2100, 540, 2345, 3925, 405, 3457, 1319, 2468, 3362, 2815, 1867, 2372, 1281, 1714, 3690, 482, 3498, 1842, 1285, 3994, 558, 2039, 81, 2499, 678, 1481, 1923, 964, 12, 3824, 2980, 2205, 2762, 3432, 2398, 181, 3247, 462, 4094, 2350, 3589, 3089,
+    1555, 1094, 4041, 247, 1267, 908, 3959, 2041, 732, 3860, 2343, 3132, 3769, 2144, 1621, 237, 912, 1329, 3025, 2146, 2642, 1775, 3721, 2746, 1121, 1953, 902, 2285, 130, 3671, 1659, 278, 3153, 522, 2721, 123, 2996, 1466, 2380, 377, 3231, 873, 1510, 3476, 3123, 1250, 2147, 3650, 2839, 3451, 2323, 1122, 3545, 379, 1765, 1218, 603, 3768, 1360, 938, 2885, 133, 1245, 363,
+    2364, 554, 2743, 3344, 2474, 530, 3112, 169, 1297, 3430, 536, 1741, 98, 1043, 2574, 3253, 2246, 1854, 4022, 510, 3283, 204, 858, 3398, 36, 3118, 1478, 3794, 2986, 706, 2176, 922, 3559, 1097, 3976, 3322, 2149, 1160, 2810, 3883, 2007, 2513, 2953, 328, 1721, 3793, 422, 2566, 807, 329, 1638, 1967, 648, 2520, 3727, 3109, 2116, 2927, 2491, 1939, 3365, 1709, 2728, 3815,
+    2037, 3120, 831, 1405, 1896, 3592, 1622, 2369, 2864, 2151, 1107, 2542, 3532, 1410, 3917, 427, 3568, 709, 2509, 1503, 1037, 2973, 2436, 1604, 4035, 2594, 563, 1819, 2659, 1234, 4004, 2565, 1511, 2273, 1823, 336, 882, 3772, 575, 1628, 171, 3570, 1120, 2260, 2716, 935, 3064, 1806, 1342, 3144, 3900, 2744, 3296, 985, 1546, 238, 896, 1663, 305, 3660, 695, 2213, 960, 3407,
+    144, 1795, 3894, 2267, 51, 2708, 1023, 3818, 366, 1821, 4087, 2985, 755, 2057, 2912, 949, 1583, 2774, 231, 3447, 2258, 3866, 1982, 672, 1225, 2077, 3320, 1062, 370, 3241, 1968, 7, 3068, 681, 3631, 2573, 1567, 3175, 2321, 1067, 3070, 722, 1856, 3744, 642, 1471, 4084, 131, 3514, 2443, 531, 1227, 155, 2265, 4024, 2658, 3326, 3910, 1168, 3078, 1530, 3956, 489, 1424,
+    3647, 1203, 420, 2924, 3755, 719, 3248, 1376, 3067, 890, 196, 1559, 3269, 270, 2432, 1885, 3212, 1164, 3778, 1752, 579, 1338, 344, 3585, 3017, 288, 3658, 2371, 3882, 1691, 611, 2789, 3809, 1339, 389, 2950, 2015, 59, 3548, 2751, 2158, 4011, 1352, 29, 3388, 2370, 2812, 1946, 954, 2110, 1558, 2947, 3573, 1909, 1326, 679, 1853, 2312, 551, 2702, 33, 2414, 3209, 2824,
+    2547, 2143, 3379, 966, 1492, 1979, 2479, 463, 2194, 3657, 2738, 2318, 1261, 3713, 604, 4002, 11, 2192, 2967, 919, 2607, 3369, 2837, 1676, 2539, 984, 1568, 93, 2901, 1318, 3538, 1041, 2216, 1756, 3454, 1030, 4050, 1402, 798, 1723, 311, 3277, 2546, 2886, 2043, 461, 1206, 3677, 361, 3260, 3988, 809, 2605, 470, 3007, 3517, 102, 3221, 1398, 2062, 3611, 1134, 1928, 865,
+    4060, 621, 1710, 2606, 3510, 317, 4017, 1682, 3329, 1159, 1940, 654, 3461, 1789, 1015, 2691, 1455, 3599, 374, 1947, 4069, 71, 2126, 763, 3961, 2278, 3161, 1997, 824, 2623, 2080, 244, 3257, 780, 2732, 2308, 545, 3351, 2476, 3806, 1204, 588, 1591, 963, 3610, 1699, 754, 3049, 2651, 1106, 65, 2221, 1644, 3821, 1100, 2463, 1614, 3801, 965, 2965, 715, 3394, 1593, 212,
+};
+
+#endif /* BLUE_NOISE_64X64_H */
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/dither/make-blue-noise.c
@@ -0,0 +1,679 @@
+/* Blue noise generation using the void-and-cluster method as described in
+ *
+ *     The void-and-cluster method for dither array generation
+ *     Ulichney, Robert A (1993)
+ *
+ *     http://cv.ulichney.com/papers/1993-void-cluster.pdf
+ *
+ * Note that running with openmp (-DUSE_OPENMP) will trigger additional
+ * randomness due to computing reductions in parallel, and is not recommended
+ * unless generating very large dither arrays.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <stdio.h>
+
+/* Booleans and utility functions */
+
+#ifndef TRUE
+#   define TRUE 1  /* boolean true for bool_t values */
+#endif
+
+#ifndef FALSE
+#   define FALSE 0  /* boolean false for bool_t values */
+#endif
+
+typedef int bool_t;  /* plain-int boolean; avoids requiring <stdbool.h> */
+
/* Return the smaller of X and Y. */
int
imin (int x, int y)
{
    if (y < x)
        return y;

    return x;
}
+
/* Memory allocation */

/* Allocate a buffer of a * b * c bytes, returning NULL if the product
 * would overflow (or if any factor is zero, which callers treat as an
 * allocation failure anyway).
 *
 * Fixes over the previous version:
 *  - b == 0 or c == 0 no longer causes a division by zero (undefined
 *    behavior) in the overflow guards.
 *  - The overflow limit is SIZE_MAX, the true limit of malloc's size_t
 *    argument, rather than INT32_MAX.
 */
void *
malloc_abc (unsigned int a, unsigned int b, unsigned int c)
{
    if (a == 0 || b == 0 || c == 0)
        return NULL;
    if (a > SIZE_MAX / b)
        return NULL;
    if ((size_t) a * b > SIZE_MAX / c)
        return NULL;

    return malloc ((size_t) a * b * c);
}
+
/* Random number generation
 *
 * Xorwow-style generator: words 0-3 hold a shift register, word 4 a
 * Weyl counter that is added to the output.
 */
typedef uint32_t xorwow_state_t[5];

uint32_t
xorwow_next (xorwow_state_t *state)
{
    uint32_t *w    = *state;
    uint32_t  head = w[0];
    uint32_t  tail = w[3];

    /* Shift the register down by one word. */
    w[3] = w[2];
    w[2] = w[1];
    w[1] = head;

    /* Scramble the evicted word with the old head. */
    tail ^= tail >> 2;
    tail ^= tail << 1;
    tail ^= head ^ (head << 4);
    w[0] = tail;

    /* Advance the counter and fold it into the result. */
    w[4] += 362437;

    return tail + w[4];
}

/* Map the next xorwow output onto [0, 1] using its top 23 bits. */
float
xorwow_float (xorwow_state_t *s)
{
    return (xorwow_next (s) >> 9) / (float)((1 << 23) - 1);
}
+
+/* Floating point matrices
+ *
+ * Used to cache the cluster sizes.
+ */
+typedef struct matrix_t {
+    int width;
+    int height;
+    float *buffer;
+} matrix_t;
+
+bool_t
+matrix_init (matrix_t *matrix, int width, int height)
+{
+    float *buffer;
+
+    if (!matrix)
+	return FALSE;
+
+    buffer = malloc_abc (width, height, sizeof (float));
+
+    if (!buffer)
+	return FALSE;
+
+    matrix->buffer = buffer;
+    matrix->width  = width;
+    matrix->height = height;
+
+    return TRUE;
+}
+
+bool_t
+matrix_copy (matrix_t *dst, matrix_t const *src)
+{
+    float *srcbuf = src->buffer,
+	  *srcend = src->buffer + src->width * src->height,
+	  *dstbuf = dst->buffer;
+
+    if (dst->width != src->width || dst->height != src->height)
+	return FALSE;
+
+    while (srcbuf < srcend)
+	*dstbuf++ = *srcbuf++;
+
+    return TRUE;
+}
+
+float *
+matrix_get (matrix_t *matrix, int x, int y)
+{
+    return &matrix->buffer[y * matrix->width + x];
+}
+
+void
+matrix_destroy (matrix_t *matrix)
+{
+    free (matrix->buffer);
+}
+
+/* Binary patterns */
+typedef struct pattern_t {
+    int width;
+    int height;
+    bool_t *buffer;
+} pattern_t;
+
+bool_t
+pattern_init (pattern_t *pattern, int width, int height)
+{
+    bool_t *buffer;
+
+    if (!pattern)
+	return FALSE;
+
+    buffer = malloc_abc (width, height, sizeof (bool_t));
+
+    if (!buffer)
+	return FALSE;
+
+    pattern->buffer = buffer;
+    pattern->width  = width;
+    pattern->height = height;
+
+    return TRUE;
+}
+
+bool_t
+pattern_copy (pattern_t *dst, pattern_t const *src)
+{
+    bool_t *srcbuf = src->buffer,
+	   *srcend = src->buffer + src->width * src->height,
+	   *dstbuf = dst->buffer;
+
+    if (dst->width != src->width || dst->height != src->height)
+	return FALSE;
+
+    while (srcbuf < srcend)
+	*dstbuf++ = *srcbuf++;
+
+    return TRUE;
+}
+
+bool_t *
+pattern_get (pattern_t *pattern, int x, int y)
+{
+    return &pattern->buffer[y * pattern->width + x];
+}
+
+void
+pattern_fill_white_noise (pattern_t *pattern, float fraction,
+			  xorwow_state_t *s)
+{
+    bool_t *buffer = pattern->buffer;
+    bool_t *end    = buffer + (pattern->width * pattern->height);
+
+    while (buffer < end)
+	*buffer++ = xorwow_float (s) < fraction;
+}
+
+void
+pattern_destroy (pattern_t *pattern)
+{
+    free (pattern->buffer);
+}
+
+/* Dither arrays */
+typedef struct array_t {
+    int width;
+    int height;
+    uint32_t *buffer;
+} array_t;
+
+/* Allocate the rank buffer of `array` for a width x height grid.
+ * Returns FALSE if array is NULL or the (overflow-checked) allocation
+ * fails; TRUE on success.  The buffer contents are left uninitialised. */
+bool_t
+array_init (array_t *array, int width, int height)
+{
+    uint32_t *buffer;
+
+    if (!array)
+	return FALSE;
+
+    buffer = malloc_abc (width, height, sizeof (uint32_t));
+
+    if (!buffer)
+	return FALSE;
+
+    array->buffer = buffer;
+    array->width  = width;
+    array->height = height;
+
+    return TRUE;
+}
+
+/* Return a pointer to the array element at column x, row y.
+ * No bounds checking is performed. */
+uint32_t *
+array_get (array_t *array, int x, int y)
+{
+    return &array->buffer[y * array->width + x];
+}
+
+/* Write the dither array as a binary PGM ("P5") image for visual
+ * inspection.  The maximum sample value is width*height - 1, so one
+ * byte per sample is used only when that maximum fits in 8 bits.
+ * NOTE(review): two-byte samples are emitted low byte first, while the
+ * PGM specification requires most-significant byte first -- confirm
+ * what the intended consumers expect.
+ * Returns TRUE on success, FALSE on any I/O failure. */
+bool_t
+array_save_ppm (array_t *array, const char *filename)
+{
+    FILE *f = fopen(filename, "wb");
+
+    int i   = 0;
+    int bpp = 2;
+    uint8_t buffer[1024];   /* staging buffer, flushed one chunk at a time */
+
+    if (!f)
+	return FALSE;
+
+    if (array->width * array->height - 1 < 256)
+	bpp = 1;
+
+    /* PGM header: magic, dimensions, maximum sample value. */
+    fprintf(f, "P5 %d %d %d\n", array->width, array->height,
+	    array->width * array->height - 1);
+    while (i < array->width * array->height)
+    {
+	    int j = 0;
+	    /* Clamp against the *remaining* samples (i + j), not the total
+	     * count, so the final partial chunk never reads past the end
+	     * of array->buffer. */
+	    for (; j < 1024 / bpp && i + j < array->width * array->height; ++j)
+	    {
+		    uint32_t v = array->buffer[i + j];
+		    if (bpp == 2)
+		    {
+			buffer[2 * j] = v & 0xff;
+			buffer[2 * j + 1] = (v & 0xff00) >> 8;
+		    } else {
+			buffer[j] = v;
+		    }
+	    }
+
+	    fwrite((void *)buffer, bpp, j, f);
+	    i += j;
+    }
+
+    if (fclose(f) != 0)
+	return FALSE;
+
+    return TRUE;
+}
+
+/* Emit the dither array as a C header containing a static const
+ * uint16_t table (plus include guards), suitable for inclusion in
+ * pixman.  Returns TRUE on success, FALSE on any I/O failure. */
+bool_t
+array_save (array_t *array, const char *filename)
+{
+    int x, y;
+    FILE *f = fopen(filename, "wb");
+
+    if (!f)
+	return FALSE;
+
+    fprintf (f, 
+"/* WARNING: This file is generated by make-blue-noise.c\n"
+" * Please edit that file instead of this one.\n"
+" */\n"
+"\n"
+"#ifndef BLUE_NOISE_%dX%d_H\n"
+"#define BLUE_NOISE_%dX%d_H\n"
+"\n"
+"#include <stdint.h>\n"
+"\n", array->width, array->height, array->width, array->height);
+
+    fprintf (f, "static const uint16_t dither_blue_noise_%dx%d[%d] = {\n",
+	     array->width, array->height, array->width * array->height);
+
+    for (y = 0; y < array->height; ++y)
+    {
+	fprintf (f, "    ");
+	for (x = 0; x < array->width; ++x)
+	{
+	    if (x != 0)
+		fprintf (f, ", ");
+
+	    fprintf (f, "%d", *array_get (array, x, y));
+	}
+
+	fprintf (f, ",\n");
+    }
+    fprintf (f, "};\n");
+
+    fprintf (f, "\n#endif /* BLUE_NOISE_%dX%d_H */\n",
+	     array->width, array->height);
+
+    if (fclose(f) != 0)
+	return FALSE;
+
+    return TRUE;
+}
+
+/* Release the buffer owned by `array` (the struct itself is not freed). */
+void
+array_destroy (array_t *array)
+{
+    free (array->buffer);
+}
+
+/* Dither array generation */
+/* For every pixel, compute the gaussian-weighted density of same-valued
+ * pixels around it (the "cluster" measure for TRUE pixels, the "void"
+ * measure for FALSE ones) and store it in `matrix`.  Distances wrap
+ * around the pattern edges (toroidal topology), and the gaussian uses
+ * sigma = 1.5 (tsqsi = 2 * sigma^2).  Returns FALSE if the matrix
+ * dimensions do not match the pattern, TRUE otherwise. */
+bool_t
+compute_cluster_sizes (pattern_t *pattern, matrix_t *matrix)
+{
+    int width  = pattern->width,
+	height = pattern->height;
+
+    if (matrix->width != width || matrix->height != height)
+	return FALSE;
+
+    int px, py, qx, qy, dx, dy;
+    float tsqsi = 2.f * 1.5f * 1.5f;
+
+#ifdef USE_OPENMP
+#pragma omp parallel for default (none) \
+    private (py, px, qy, qx, dx, dy) \
+    shared (height, width, pattern, matrix, tsqsi)
+#endif
+    for (py = 0; py < height; ++py)
+    {
+	for (px = 0; px < width; ++px)
+	{
+	    bool_t pixel = *pattern_get (pattern, px, py);
+	    float dist   = 0.f;
+
+	    for (qx = 0; qx < width; ++qx)
+	    {
+		/* Torus distance: the shorter of the direct and the
+		 * wrapped-around separation. */
+		dx = imin (abs (qx - px), width - abs (qx - px));
+		dx = dx * dx;
+
+		for (qy = 0; qy < height; ++qy)
+		{
+		    dy = imin (abs (qy - py), height - abs (qy - py));
+		    dy = dy * dy;
+
+		    /* Only pixels with the same value contribute (the
+		     * boolean comparison yields 0 or 1). */
+		    dist += (pixel == *pattern_get (pattern, qx, qy))
+			* expf (- (dx + dy) / tsqsi);
+		}
+	    }
+
+	    *matrix_get (matrix, px, py) = dist;
+	}
+    }
+
+    return TRUE;
+}
+
+/* Flip the pixel at (x, y) to its opposite value and incrementally
+ * update the cluster-size matrix: each element gains the gaussian term
+ * for (x, y) if it now matches the new value and loses it otherwise
+ * (the factor (2*b - 1) is +1 or -1).  The entry at (x, y) itself is
+ * recomputed from scratch as `dist`.  Returns FALSE on a pattern/matrix
+ * size mismatch, TRUE otherwise. */
+bool_t
+swap_pixel (pattern_t *pattern, matrix_t *matrix, int x, int y)
+{
+    int width  = pattern->width,
+	height = pattern->height;
+
+    bool_t new;
+
+    float f,
+          dist  = 0.f,
+	  tsqsi = 2.f * 1.5f * 1.5f;
+
+    int px, py, dx, dy;
+    bool_t b;
+
+    /* Validate dimensions BEFORE flipping the pixel so a failed call
+     * leaves both the pattern and the matrix untouched.  (Previously the
+     * pixel was flipped first, corrupting the pattern on failure.) */
+    if (matrix->width != width || matrix->height != height)
+	return FALSE;
+
+    new = !*pattern_get (pattern, x, y);
+    *pattern_get (pattern, x, y) = new;
+
+#ifdef USE_OPENMP
+#pragma omp parallel for reduction (+:dist) default (none) \
+    private (px, py, dx, dy, b, f) \
+    shared (x, y, width, height, pattern, matrix, new, tsqsi)
+#endif
+    for (py = 0; py < height; ++py)
+    {
+	/* Torus distance to the flipped pixel, squared per axis. */
+	dy = imin (abs (py - y), height - abs (py - y));
+	dy = dy * dy;
+
+	for (px = 0; px < width; ++px)
+	{
+	    dx = imin (abs (px - x), width - abs (px - x));
+	    dx = dx * dx;
+
+	    b = (*pattern_get (pattern, px, py) == new);
+	    f = expf (- (dx + dy) / tsqsi);
+	    *matrix_get (matrix, px, py) += (2 * b - 1) * f;
+
+	    dist += b * f;
+	}
+    }
+
+    *matrix_get (matrix, x, y) = dist;
+    return TRUE;
+}
+
+/* Find the coordinates of the pixel whose value equals `pixel` and whose
+ * matrix entry is largest (the tightest cluster of TRUE pixels, or the
+ * largest void of FALSE ones) and return them through *xmax / *ymax.
+ * Asserts that at least one matching pixel exists.  Under OpenMP each
+ * thread tracks its own best candidate and the global winner is
+ * reconciled in a critical section after the max-reduction. */
+void
+largest_cluster (pattern_t *pattern, matrix_t *matrix,
+		 bool_t pixel, int *xmax, int *ymax)
+{
+    int width       = pattern->width,
+	height      = pattern->height;
+
+    int   x, y;
+
+    float vmax = -INFINITY;
+
+#ifdef USE_OPENMP
+#pragma omp parallel default (none) \
+    private (x, y) \
+    shared (height, width, pattern, matrix, pixel, xmax, ymax, vmax)
+#endif
+    {
+	int xbest = -1,
+	    ybest = -1;
+
+#ifdef USE_OPENMP
+	float vbest = -INFINITY;
+
+#pragma omp for reduction (max: vmax) collapse (2)
+#endif
+	for (y = 0; y < height; ++y)
+	{
+	    for (x = 0; x < width; ++x)
+	    {
+		if (*pattern_get (pattern, x, y) != pixel)
+		    continue;
+
+		if (*matrix_get (matrix, x, y) > vmax)
+		{
+		    vmax = *matrix_get (matrix, x, y);
+#ifdef USE_OPENMP
+		    /* Remember this thread's private best so the owner of
+		     * the global maximum can be identified afterwards. */
+		    vbest = vmax;
+#endif
+		    xbest = x;
+		    ybest = y;
+		}
+	    }
+	}
+
+#ifdef USE_OPENMP
+#pragma omp barrier
+#pragma omp critical
+	{
+	    /* The thread whose private best matches the reduced global
+	     * maximum publishes its coordinates. */
+	    if (vmax == vbest)
+	    {
+		*xmax = xbest;
+		*ymax = ybest;
+	    }
+	}
+#else
+	*xmax = xbest;
+	*ymax = ybest;
+#endif
+    }
+
+    assert (vmax > -INFINITY);
+}
+
+/* Initial phase of the void-and-cluster method: repeatedly remove the
+ * pixel in the tightest cluster and re-insert one into the largest
+ * void, until the removed pixel lands back where it came from -- at
+ * which point the TRUE pixels are evenly distributed. */
+void
+generate_initial_binary_pattern (pattern_t *pattern, matrix_t *matrix)
+{
+    int xcluster = 0,
+	ycluster = 0,
+	xvoid    = 0,
+	yvoid    = 0;
+
+    for (;;)
+    {
+	largest_cluster (pattern, matrix, TRUE, &xcluster, &ycluster);
+	assert (*pattern_get (pattern, xcluster, ycluster) == TRUE);
+	swap_pixel (pattern, matrix, xcluster, ycluster);
+
+	largest_cluster (pattern, matrix, FALSE, &xvoid, &yvoid);
+	assert (*pattern_get (pattern, xvoid, yvoid) == FALSE);
+	swap_pixel (pattern, matrix, xvoid, yvoid);
+
+	if (xcluster == xvoid && ycluster == yvoid)
+	    return;
+    }
+}
+
+/* Build the final dither array from the prototype pattern.
+ * Phase 1 ranks the initial TRUE pixels by removing them tightest-
+ * cluster first (ranks initial_rank-1 down to 0); phases 2 & 3 rank the
+ * remaining positions by filling the largest voids (ranks initial_rank
+ * up to width*height-1).  temp_pattern and temp_matrix are scratch
+ * copies of the prototype and its cluster matrix and are trashed.
+ * Returns FALSE on any dimension mismatch, TRUE on success. */
+bool_t
+generate_dither_array (array_t *array,
+		       pattern_t const *prototype, matrix_t const *matrix,
+		       pattern_t *temp_pattern, matrix_t *temp_matrix)
+{
+    int width        = prototype->width,
+	height       = prototype->height;
+
+    int x, y, rank;
+
+    int initial_rank = 0;
+
+    if (array->width != width || array->height != height)
+	return FALSE;
+
+    // Make copies of the prototype and associated sizes matrix since we will
+    // trash them
+    if (!pattern_copy (temp_pattern, prototype))
+	return FALSE;
+
+    if (!matrix_copy (temp_matrix, matrix))
+	return FALSE;
+
+    // Compute initial rank (the number of TRUE pixels), zeroing the
+    // output array along the way.
+    for (y = 0; y < height; ++y)
+    {
+	for (x = 0; x < width; ++x)
+	{
+	    if (*pattern_get (temp_pattern, x, y))
+		initial_rank += 1;
+
+	    *array_get (array, x, y) = 0;
+	}
+    }
+
+    // Phase 1
+    for (rank = initial_rank; rank > 0; --rank)
+    {
+	largest_cluster (temp_pattern, temp_matrix, TRUE, &x, &y);
+	swap_pixel (temp_pattern, temp_matrix, x, y);
+	*array_get (array, x, y) = rank - 1;
+    }
+
+    // Make copies again for phases 2 & 3
+    if (!pattern_copy (temp_pattern, prototype))
+	return FALSE;
+
+    if (!matrix_copy (temp_matrix, matrix))
+	return FALSE;
+
+    // Phase 2 & 3
+    for (rank = initial_rank; rank < width * height; ++rank)
+    {
+	largest_cluster (temp_pattern, temp_matrix, FALSE, &x, &y);
+	swap_pixel (temp_pattern, temp_matrix, x, y);
+	*array_get (array, x, y) = rank;
+    }
+
+    return TRUE;
+}
+
+/* Generate a size x size blue-noise dither array and save it as a C
+ * header (`c_filename`) and, when SAVE_PPM is enabled, as a PGM image
+ * (`ppm_filename`).  All intermediate structures are allocated here and
+ * released on every exit path.  Returns TRUE on success. */
+bool_t
+generate (int size, xorwow_state_t *s,
+	  char const *c_filename, char const *ppm_filename)
+{
+    bool_t ok = TRUE;
+
+    pattern_t prototype, temp_pattern;
+    array_t   array;
+    matrix_t  matrix, temp_matrix;
+
+    printf ("Generating %dx%d blue noise...\n", size, size);
+
+    /* Allocate everything up front, unwinding in reverse order on
+     * failure. */
+    if (!pattern_init (&prototype, size, size))
+	return FALSE;
+
+    if (!pattern_init (&temp_pattern, size, size))
+    {
+	pattern_destroy (&prototype);
+	return FALSE;
+    }
+
+    if (!matrix_init (&matrix, size, size))
+    {
+	pattern_destroy (&temp_pattern);
+	pattern_destroy (&prototype);
+	return FALSE;
+    }
+
+    if (!matrix_init (&temp_matrix, size, size))
+    {
+	matrix_destroy (&matrix);
+	pattern_destroy (&temp_pattern);
+	pattern_destroy (&prototype);
+	return FALSE;
+    }
+
+    if (!array_init (&array, size, size))
+    {
+	matrix_destroy (&temp_matrix);
+	matrix_destroy (&matrix);
+	pattern_destroy (&temp_pattern);
+	pattern_destroy (&prototype);
+	return FALSE;
+    }
+
+    /* Seed the prototype with 10% white noise, then relax it into a
+     * blue-noise distribution. */
+    printf("Filling initial binary pattern with white noise...\n");
+    pattern_fill_white_noise (&prototype, .1, s);
+
+    printf("Initializing cluster sizes...\n");
+    if (!compute_cluster_sizes (&prototype, &matrix))
+    {
+	fprintf (stderr, "Error while computing cluster sizes\n");
+	ok = FALSE;
+	goto out;
+    }
+
+    printf("Generating initial binary pattern...\n");
+    generate_initial_binary_pattern (&prototype, &matrix);
+
+    printf("Generating dither array...\n");
+    if (!generate_dither_array (&array, &prototype, &matrix,
+			 &temp_pattern, &temp_matrix))
+    {
+	fprintf (stderr, "Error while generating dither array\n");
+	ok = FALSE;
+	goto out;
+    }
+
+    printf("Saving dither array...\n");
+    if (!array_save (&array, c_filename))
+    {
+	fprintf (stderr, "Error saving dither array\n");
+	ok = FALSE;
+	goto out;
+    }
+
+#if SAVE_PPM
+    if (!array_save_ppm (&array, ppm_filename))
+    {
+	fprintf (stderr, "Error saving dither array PPM\n");
+	ok = FALSE;
+	goto out;
+    }
+#else
+    (void)ppm_filename;
+#endif
+
+    printf("All done!\n");
+
+out:
+    array_destroy (&array);
+    matrix_destroy (&temp_matrix);
+    matrix_destroy (&matrix);
+    pattern_destroy (&temp_pattern);
+    pattern_destroy (&prototype);
+    return ok;
+}
+
+/* Entry point: generate the 64x64 blue-noise table shipped with pixman,
+ * using a fixed PRNG seed so the output is reproducible. */
+int
+main (void)
+{
+    xorwow_state_t s = {1185956906, 12385940, 983948, 349208051, 901842};
+
+    if (!generate (64, &s, "blue-noise-64x64.h", "blue-noise-64x64.ppm"))
+	return -1;
+
+    return 0;
+}
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/loongson-mmintrin.h
@@ -0,0 +1,412 @@
+/* The gcc-provided loongson intrinsic functions are way too fucking broken
+ * to be of any use, otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *	punpcklwd	$f9,$f9,$f5
+ *	    dmtc1	v0,$f8
+ *	punpcklwd	$f19,$f19,$f5
+ *	    dmfc1	t9,$f9
+ *	    dmtc1	v0,$f9
+ *	    dmtc1	t9,$f20
+ *	    dmfc1	s0,$f19
+ *	punpcklbh	$f20,$f20,$f2
+ *
+ *   where crap just gets copied back and forth between integer and floating-
+ *   point registers ad nauseum.
+ *
+ * Instead of trying to workaround the problems from these crap intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+#include <stdint.h>
+
+/* vectors are stored in 64-bit floating-point registers */
+typedef double __m64;
+/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
+typedef float  __m32;
+
+/* Return an all-zero-bits vector (the bit pattern of 0.0 is all zeros). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+	return 0.0;
+}
+
+/* Packed add of four 16-bit lanes (paddh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Packed add of two 32-bit lanes (paddw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Packed add of four 16-bit lanes with unsigned saturation (paddush). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddush %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Packed add of eight 8-bit lanes with unsigned saturation (paddusb). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddusb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Bitwise AND of the full 64 bits. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("and %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Lane-wise 32-bit equality compare: equal lanes become all-ones,
+ * others all-zeros (pcmpeqw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pcmpeqw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Intentionally empty: unlike x86 MMX there is no FP/MMX state to reset
+ * here, but pixman-mmx.c calls _mm_empty unconditionally. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+
+/* Multiply 16-bit lanes and add adjacent pairs of products into two
+ * 32-bit lanes (pmaddhw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmaddhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Unsigned 16-bit lane multiply keeping the high halves (pmulhuh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmulhuh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* 16-bit lane multiply keeping the low halves (pmullh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmullh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Bitwise OR of the full 64 bits. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("or %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Pack 16-bit lanes into 8-bit lanes with unsigned saturation
+ * (packushb). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packushb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Pack 32-bit lanes into 16-bit lanes with signed saturation
+ * (packsswh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packsswh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Encode a 4-lane shuffle selector, two bits per lane (MSB lane first),
+ * matching the x86 _MM_SHUFFLE convention.  #undef'd below. */
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+/* Build a vector from four 16-bit values (__w3 = highest lane).
+ * Compile-time constants are assembled directly; a splat of a single
+ * value is broadcast with pshufh; the general case assembles the lanes
+ * in an integer and reinterprets its bits.
+ * NOTE(review): the *(__m64 *)&val casts type-pun through incompatible
+ * pointers -- relies on GCC's tolerated behaviour; memcpy would be the
+ * strictly conforming form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{
+	if (__builtin_constant_p (__w3) &&
+	    __builtin_constant_p (__w2) &&
+	    __builtin_constant_p (__w1) &&
+	    __builtin_constant_p (__w0))
+	{
+		uint64_t val = ((uint64_t)__w3 << 48)
+			     | ((uint64_t)__w2 << 32)
+			     | ((uint64_t)__w1 << 16)
+			     | ((uint64_t)__w0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+	{
+		/* TODO: handle other cases */
+		uint64_t val = __w3;
+		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	} else {
+		uint64_t val = ((uint64_t)__w3 << 48)
+			     | ((uint64_t)__w2 << 32)
+			     | ((uint64_t)__w1 << 16)
+			     | ((uint64_t)__w0 <<  0);
+		return *(__m64 *)&val;
+	}
+}
+
+/* Build a vector from two 32-bit values (__i1 = high lane).  Same
+ * strategy as _mm_set_pi16: constants assembled directly, equal halves
+ * broadcast with pshufh, general case via integer assembly. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{
+	if (__builtin_constant_p (__i1) &&
+	    __builtin_constant_p (__i0))
+	{
+		uint64_t val = ((uint64_t)__i1 << 32)
+			     | ((uint64_t)__i0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__i1 == __i0)
+	{
+		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	} else {
+		uint64_t val = ((uint64_t)__i1 << 32)
+			     | ((uint64_t)__i0 <<  0);
+		return *(__m64 *)&val;
+	}
+}
+#undef _MM_SHUFFLE
+
+/* Shuffle the four 16-bit lanes according to the selector __n (pshufh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int64_t __n)
+{
+	__m64 ret;
+	asm("pshufh %0, %1, %2\n\t"
+	    : "=f" (ret)
+	    : "f" (__m), "f" (*(__m64 *)&__n)
+	);
+	return ret;
+}
+
+/* Shift each 16-bit lane left by __count bits (psllh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psllh  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+/* Shift the whole 64-bit vector left by __count bits (dsll). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsll  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+/* Logical right shift of each 16-bit lane by __count bits (psrlh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+/* Logical right shift of each 32-bit lane by __count bits (psrlw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+/* Logical right shift of the whole 64-bit vector by __count bits (dsrl). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsrl  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+/* Packed subtract of four 16-bit lanes (psubh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("psubh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Interleave the high four bytes of the two operands (punpckhbh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Interleave the high two 16-bit lanes of the two operands (punpckhhw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Interleave the low four bytes of the two operands (punpcklbh). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
+ * allows load8888 to use 32-bit loads */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Interleave the low two 16-bit lanes of the two operands (punpcklhw). */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Bitwise XOR of the full 64 bits. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("xor %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Extract the 16-bit lane selected by __pos into the low bits of the
+ * result (pextrh); __pos is passed in a register, unlike insert below. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_extract_pi16 (__m64 __m, int64_t __pos)
+{
+	__m64 ret;
+	asm("pextrh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__pos)
+	);
+	return ret;
+}
+
+/* Insert the low 16 bits of __m2 into lane __pos of __m1.  The lane
+ * index is baked into the mnemonic (pinsrh_0..pinsrh_3) via the "i"
+ * constraint, so __pos must be a compile-time constant. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
+{
+	__m64 ret;
+	asm("pinsrh_%3 %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2), "i" (__pos)
+	);
+	return ret;
+}
deleted file mode 100644
--- a/gfx/cairo/libpixman/src/make-combine.pl
+++ /dev/null
@@ -1,86 +0,0 @@
-$usage = "Usage: combine.pl { 8 | 16 } < pixman-combine.c.template";
-
-$#ARGV == 0 or die $usage;
-
-# Get the component size.
-$size = int($ARGV[0]);
-$size == 8 or $size == 16 or die $usage;
-
-$pixel_size = $size * 4;
-$half_pixel_size = $size * 2;
-
-sub mask {
-    my $str = shift;
-    my $suffix;
-    $suffix = "ULL" if $size > 8;
-
-    return "0x" . $str . $suffix;
-}
-
-# Generate mask strings.
-$nibbles = $size / 4;
-$mask = "f" x $nibbles;
-$zero_mask = "0" x $nibbles;
-$one_half = "8" . "0" x ($nibbles - 1);
-
-print "/* WARNING: This file is generated by combine.pl from combine.inc.\n";
-print "   Please edit one of those files rather than this one. */\n";
-print "\n";
-
-print "#line 1 \"pixman-combine.c.template\"\n";
-
-$mask_ = mask($mask);
-$one_half_ = mask($one_half);
-$g_mask = mask($mask . $zero_mask);
-$b_mask = mask($mask . $zero_mask x 2);
-$a_mask = mask($mask . $zero_mask x 3);
-$rb_mask = mask($mask . $zero_mask . $mask);
-$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
-$rb_one_half = mask($one_half . $zero_mask . $one_half);
-$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" .  $zero_mask);
-
-while (<STDIN>) {
-    # Mask and 1/2 value for a single component.
-    s/#define COMPONENT_SIZE\b/$& $size/;
-    s/#define MASK\b/$& $mask_/;
-    s/#define ONE_HALF\b/$& $one_half_/;
-
-    # Shifts and masks for green, blue, and alpha.
-    s/#define G_SHIFT\b/$& $size/;
-    s/#define R_SHIFT\b/$& $size * 2/;
-    s/#define A_SHIFT\b/$& $size * 3/;
-    s/#define G_MASK\b/$& $g_mask/;
-    s/#define R_MASK\b/$& $b_mask/;
-    s/#define A_MASK\b/$& $a_mask/;
-
-    # Special values for dealing with red + blue at the same time.
-    s/#define RB_MASK\b/$& $rb_mask/;
-    s/#define AG_MASK\b/$& $ag_mask/;
-    s/#define RB_ONE_HALF\b/$& $rb_one_half/;
-    s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
-
-    # Add 32/64 suffix to combining function types.
-    s/\bCombineFunc\b/CombineFunc$pixel_size/;
-    s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/;
-    s/combine_width/combine_$pixel_size/;
-    s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/;
-    s/UNc/UN$size/g;
-    s/ALPHA_c/ALPHA_$size/g;
-    s/RED_c/RED_$size/g;
-    s/GREEN_c/GREEN_$size/g;
-    s/BLUE_c/BLUE_$size/g;
-
-    # Convert comp*_t values into the appropriate real types.
-    s/comp1_t/uint${size}_t/g;
-    s/comp2_t/uint${half_pixel_size}_t/g;
-    s/comp4_t/uint${pixel_size}_t/g;
-
-    # Change the function table name for the 64-bit version.
-    s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
-
-    # Change the header for the 64-bit version
-    s/pixman-combine.h/pixman-combine64.h/ if $size == 16;
-    s/pixman-combine.h/pixman-combine32.h/ if $size == 8;
-
-    print;
-}
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/meson.build
@@ -0,0 +1,129 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Generate config.h and the public pixman-version.h from the top-level
+# configuration data; the version header is installed alongside pixman.h.
+config_h = configure_file(
+  configuration : config,
+  output : 'config.h'
+)
+
+version_h = configure_file(
+  configuration : version_conf,
+  input : 'pixman-version.h.in',
+  output : 'pixman-version.h',
+  install_dir : join_paths(get_option('prefix'), get_option('includedir'), 'pixman-1')
+)
+
+# On toolchains that support it, mark the public API dllexport so the
+# symbols are exported from the Windows shared library.
+libpixman_extra_cargs = []
+if cc.has_function_attribute('dllexport')
+  libpixman_extra_cargs = ['-DPIXMAN_API=__declspec(dllexport)']
+endif
+
+# Each entry: [name, enabled?, extra C flags, extra assembly sources].
+pixman_simd_libs = []
+simds = [
+  # the mmx library can be compiled with mmx on x86/x86_64, iwmmxt on
+  # some arm cores, or loongson mmi on loongson mips systems. The
+  # libraries will all have the same name, "pixman-mmx", but there is
+  # no chance of more than one version being built in the same build
+  # because no system could have mmx, iwmmxt, and mmi, and it
+  # simplifies the build logic to give them the same name.
+  ['mmx', have_mmx, mmx_flags, []],
+  ['mmx', have_loongson_mmi, loongson_mmi_flags, []],
+  ['mmx', have_iwmmxt, iwmmxt_flags, []],
+
+  ['sse2', have_sse2, sse2_flags, []],
+  ['ssse3', have_ssse3, ssse3_flags, []],
+  ['vmx', have_vmx, vmx_flags, []],
+  ['arm-simd', have_armv6_simd, [],
+   ['pixman-arm-simd-asm.S', 'pixman-arm-simd-asm-scaled.S']],
+  ['arm-neon', have_neon, [],
+   ['pixman-arm-neon-asm.S', 'pixman-arm-neon-asm-bilinear.S']],
+  ['mips-dspr2', have_mips_dspr2, mips_dspr2_flags,
+   ['pixman-mips-dspr2-asm.S', 'pixman-mips-memcpy-asm.S']],
+]
+
+# Build each enabled backend as its own static library so it can be
+# compiled with its backend-specific flags and assembly sources.
+foreach simd : simds
+  if simd[1]
+    name = 'pixman-' + simd[0]
+    pixman_simd_libs += static_library(
+      name,
+      [name + '.c', config_h, version_h, simd[3]],
+      c_args : simd[2]
+    )
+  endif
+endforeach
+
+# Architecture-independent sources compiled into every build.
+pixman_files = files(
+  'pixman.c',
+  'pixman-access.c',
+  'pixman-access-accessors.c',
+  'pixman-bits-image.c',
+  'pixman-combine32.c',
+  'pixman-combine-float.c',
+  'pixman-conical-gradient.c',
+  'pixman-filter.c',
+  'pixman-x86.c',
+  'pixman-mips.c',
+  'pixman-arm.c',
+  'pixman-ppc.c',
+  'pixman-edge.c',
+  'pixman-edge-accessors.c',
+  'pixman-fast-path.c',
+  'pixman-glyph.c',
+  'pixman-general.c',
+  'pixman-gradient-walker.c',
+  'pixman-image.c',
+  'pixman-implementation.c',
+  'pixman-linear-gradient.c',
+  'pixman-matrix.c',
+  'pixman-noop.c',
+  'pixman-radial-gradient.c',
+  'pixman-region16.c',
+  'pixman-region32.c',
+  'pixman-solid-fill.c',
+  'pixman-timer.c',
+  'pixman-trap.c',
+  'pixman-utils.c',
+)
+
+# We cannot use 'link_with' or 'link_whole' because meson wont do the right
+# thing for static archives.  Instead, extract the objects from each SIMD
+# static library and link them into the final library directly.
+_obs = []
+foreach l : pixman_simd_libs
+  _obs += l.extract_all_objects()
+endforeach
+
+libpixman = library(
+  'pixman-1',
+  [pixman_files, config_h, version_h],
+  objects : _obs,
+  c_args : libpixman_extra_cargs,
+  dependencies : [dep_m, dep_threads],
+  version : meson.project_version(),
+  install : true,
+)
+
+inc_pixman = include_directories('.')
+
+# Convenience dependency for consuming pixman from within this build tree.
+idep_pixman = declare_dependency(
+  link_with: libpixman,
+  include_directories : inc_pixman,
+)
+
+install_headers('pixman.h', subdir : 'pixman-1')
--- a/gfx/cairo/libpixman/src/pixman-access.c
+++ b/gfx/cairo/libpixman/src/pixman-access.c
@@ -63,49 +63,49 @@
     (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
 #else
 #define FETCH_4(img,l,o)						\
     (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
 #endif
 
 #ifdef WORDS_BIGENDIAN
 #define FETCH_24(img,l,o)                                              \
-    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \
-     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
-     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+    ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \
+     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
+     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
 #else
 #define FETCH_24(img,l,o)						\
-    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)	|	\
-     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)	|	\
-     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+    ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)	|	\
+     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)	|	\
+     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
 #endif
 
 /* Store macros */
 
 #ifdef WORDS_BIGENDIAN
 #define STORE_1(img,l,o,v)						\
     do									\
     {									\
 	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
 	uint32_t __m, __v;						\
 									\
-	__m = 1 << (0x1f - ((o) & 0x1f));				\
+	__m = 1U << (0x1f - ((o) & 0x1f));				\
 	__v = (v)? __m : 0;						\
 									\
 	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
     }									\
     while (0)
 #else
 #define STORE_1(img,l,o,v)						\
     do									\
     {									\
 	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
 	uint32_t __m, __v;						\
 									\
-	__m = 1 << ((o) & 0x1f);					\
+	__m = 1U << ((o) & 0x1f);					\
 	__v = (v)? __m : 0;						\
 									\
 	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
     }									\
     while (0)
 #endif
 
 #define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
@@ -289,24 +289,24 @@ convert_pixel (pixman_format_code_t from
     b = convert_channel (pixel, 0,
 			 PIXMAN_FORMAT_B (from), b_from_shift,
 			 PIXMAN_FORMAT_B (to), b_to_shift);
 
     return a | r | g | b;
 }
 
 static force_inline uint32_t
-convert_pixel_to_a8r8g8b8 (pixman_image_t *image,
+convert_pixel_to_a8r8g8b8 (bits_image_t *image,
 			   pixman_format_code_t format,
 			   uint32_t pixel)
 {
     if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY		||
 	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
     {
-	return image->bits.indexed->rgba[pixel];
+	return image->indexed->rgba[pixel];
     }
     else
     {
 	return convert_pixel (format, PIXMAN_a8r8g8b8, pixel);
     }
 }
 
 static force_inline uint32_t
@@ -327,17 +327,17 @@ convert_pixel_from_a8r8g8b8 (pixman_imag
     }
     else
     {
 	return convert_pixel (PIXMAN_a8r8g8b8, format, pixel);
     }
 }
 
 static force_inline uint32_t
-fetch_and_convert_pixel (pixman_image_t	*	image,
+fetch_and_convert_pixel (bits_image_t *		image,
 			 const uint8_t *	bits,
 			 int			offset,
 			 pixman_format_code_t	format)
 {
     uint32_t pixel;
 
     switch (PIXMAN_FORMAT_BPP (format))
     {
@@ -412,25 +412,25 @@ convert_and_store_pixel (bits_image_t *	
     default:
 	*dest = 0x0;
 	break;
     }
 }
 
 #define MAKE_ACCESSORS(format)						\
     static void								\
-    fetch_scanline_ ## format (pixman_image_t *image,			\
+    fetch_scanline_ ## format (bits_image_t *image,			\
 			       int	       x,			\
 			       int             y,			\
 			       int             width,			\
 			       uint32_t *      buffer,			\
 			       const uint32_t *mask)			\
     {									\
 	uint8_t *bits =							\
-	    (uint8_t *)(image->bits.bits + y * image->bits.rowstride);	\
+	    (uint8_t *)(image->bits + y * image->rowstride);		\
 	int i;								\
 									\
 	for (i = 0; i < width; ++i)					\
 	{								\
 	    *buffer++ =							\
 		fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \
 	}								\
     }									\
@@ -456,18 +456,18 @@ convert_and_store_pixel (bits_image_t *	
     static uint32_t							\
     fetch_pixel_ ## format (bits_image_t *image,			\
 			    int		offset,				\
 			    int		line)				\
     {									\
 	uint8_t *bits =							\
 	    (uint8_t *)(image->bits + line * image->rowstride);		\
 									\
-	return fetch_and_convert_pixel ((pixman_image_t *)image,	\
-					bits, offset, PIXMAN_ ## format); \
+	return fetch_and_convert_pixel (				\
+	    image, bits, offset, PIXMAN_ ## format);			\
     }									\
 									\
     static const void *const __dummy__ ## format
 
 MAKE_ACCESSORS(a8r8g8b8);
 MAKE_ACCESSORS(x8r8g8b8);
 MAKE_ACCESSORS(a8b8g8r8);
 MAKE_ACCESSORS(x8b8g8r8);
@@ -578,24 +578,24 @@ to_srgb (float f)
 
     if (to_linear[high] - f < f - to_linear[low])
 	return high;
     else
 	return low;
 }
 
 static void
-fetch_scanline_a8r8g8b8_sRGB_float (pixman_image_t *image,
+fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image,
 				    int             x,
 				    int             y,
 				    int             width,
 				    uint32_t *      b,
 				    const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
 
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	argb_t *argb = buffer;
@@ -607,24 +607,24 @@ fetch_scanline_a8r8g8b8_sRGB_float (pixm
 	argb->b = to_linear [(p >>  0) & 0xff];
 
 	buffer++;
     }
 }
 
 /* Expects a float buffer */
 static void
-fetch_scanline_a2r10g10b10_float (pixman_image_t *image,
+fetch_scanline_a2r10g10b10_float (bits_image_t *  image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
 
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	uint64_t a = p >> 30;
@@ -637,25 +637,67 @@ fetch_scanline_a2r10g10b10_float (pixman
 	buffer->g = pixman_unorm_to_float (g, 10);
 	buffer->b = pixman_unorm_to_float (b, 10);
 
 	buffer++;
     }
 }
 
 /* Expects a float buffer */
+#ifndef PIXMAN_FB_ACCESSORS
 static void
-fetch_scanline_x2r10g10b10_float (pixman_image_t *image,
+fetch_scanline_rgbf_float (bits_image_t   *image,
+			   int             x,
+			   int             y,
+			   int             width,
+			   uint32_t *      b,
+			   const uint32_t *mask)
+{
+    const float *bits = (float *)image->bits + y * image->rowstride;
+    const float *pixel = bits + x * 3;
+    argb_t *buffer = (argb_t *)b;
+
+    for (; width--; buffer++) {
+	buffer->r = *pixel++;
+	buffer->g = *pixel++;
+	buffer->b = *pixel++;
+	buffer->a = 1.f;
+    }
+}
+
+static void
+fetch_scanline_rgbaf_float (bits_image_t   *image,
+			    int             x,
+			    int             y,
+			    int             width,
+			    uint32_t *      b,
+			    const uint32_t *mask)
+{
+    const float *bits = (float *)image->bits + y * image->rowstride;
+    const float *pixel = bits + x * 4;
+    argb_t *buffer = (argb_t *)b;
+
+    for (; width--; buffer++) {
+	buffer->r = *pixel++;
+	buffer->g = *pixel++;
+	buffer->b = *pixel++;
+	buffer->a = *pixel++;
+    }
+}
+#endif
+
+static void
+fetch_scanline_x2r10g10b10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
 
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	uint64_t r = (p >> 20) & 0x3ff;
@@ -668,24 +710,24 @@ fetch_scanline_x2r10g10b10_float (pixman
 	buffer->b = pixman_unorm_to_float (b, 10);
 
 	buffer++;
     }
 }
 
 /* Expects a float buffer */
 static void
-fetch_scanline_a2b10g10r10_float (pixman_image_t *image,
+fetch_scanline_a2b10g10r10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
 
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	uint64_t a = p >> 30;
@@ -699,24 +741,24 @@ fetch_scanline_a2b10g10r10_float (pixman
 	buffer->b = pixman_unorm_to_float (b, 10);
 
 	buffer++;
     }
 }
 
 /* Expects a float buffer */
 static void
-fetch_scanline_x2b10g10r10_float (pixman_image_t *image,
+fetch_scanline_x2b10g10r10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
 
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	uint64_t b = (p >> 20) & 0x3ff;
@@ -728,24 +770,24 @@ fetch_scanline_x2b10g10r10_float (pixman
 	buffer->g = pixman_unorm_to_float (g, 10);
 	buffer->b = pixman_unorm_to_float (b, 10);
 
 	buffer++;
     }
 }
 
 static void
-fetch_scanline_yuy2 (pixman_image_t *image,
+fetch_scanline_yuy2 (bits_image_t   *image,
                      int             x,
                      int             line,
                      int             width,
                      uint32_t *      buffer,
                      const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line;
+    const uint32_t *bits = image->bits + image->rowstride * line;
     int i;
     
     for (i = 0; i < width; i++)
     {
 	int16_t y, u, v;
 	int32_t r, g, b;
 	
 	y = ((uint8_t *) bits)[(x + i) << 1] - 16;
@@ -762,17 +804,17 @@ fetch_scanline_yuy2 (pixman_image_t *ima
 	*buffer++ = 0xff000000 |
 	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
 	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
 	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
     }
 }
 
 static void
-fetch_scanline_yv12 (pixman_image_t *image,
+fetch_scanline_yv12 (bits_image_t   *image,
                      int             x,
                      int             line,
                      int             width,
                      uint32_t *      buffer,
                      const uint32_t *mask)
 {
     YV12_SETUP (image);
     uint8_t *y_line = YV12_Y (line);
@@ -800,16 +842,50 @@ fetch_scanline_yv12 (pixman_image_t *ima
 	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
 	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
 	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
     }
 }
 
 /**************************** Pixel wise fetching *****************************/
 
+#ifndef PIXMAN_FB_ACCESSORS
+static argb_t
+fetch_pixel_rgbf_float (bits_image_t *image,
+			int	    offset,
+			int	    line)
+{
+    float *bits = (float *)image->bits + line * image->rowstride;
+    argb_t argb;
+
+    argb.r = bits[offset * 3];
+    argb.g = bits[offset * 3 + 1];
+    argb.b = bits[offset * 3 + 2];
+    argb.a = 1.f;
+
+    return argb;
+}
+
+static argb_t
+fetch_pixel_rgbaf_float (bits_image_t *image,
+			 int	    offset,
+			 int	    line)
+{
+    float *bits = (float *)image->bits + line * image->rowstride;
+    argb_t argb;
+
+    argb.r = bits[offset * 4];
+    argb.g = bits[offset * 4 + 1];
+    argb.b = bits[offset * 4 + 2];
+    argb.a = bits[offset * 4 + 3];
+
+    return argb;
+}
+#endif
+
 static argb_t
 fetch_pixel_x2r10g10b10_float (bits_image_t *image,
 			       int	   offset,
 			       int           line)
 {
     uint32_t *bits = image->bits + line * image->rowstride;
     uint32_t p = READ (image, bits + offset);
     uint64_t r = (p >> 20) & 0x3ff;
@@ -957,31 +1033,70 @@ fetch_pixel_yv12 (bits_image_t *image,
     return 0xff000000 |
 	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
 	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
 	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
 }
 
 /*********************************** Store ************************************/
 
+#ifndef PIXMAN_FB_ACCESSORS
+static void
+store_scanline_rgbaf_float (bits_image_t *  image,
+			    int             x,
+			    int             y,
+			    int             width,
+			    const uint32_t *v)
+{
+    float *bits = (float *)image->bits + image->rowstride * y + 4 * x;
+    const argb_t *values = (argb_t *)v;
+
+    for (; width; width--, values++)
+    {
+	*bits++ = values->r;
+	*bits++ = values->g;
+	*bits++ = values->b;
+	*bits++ = values->a;
+    }
+}
+
+static void
+store_scanline_rgbf_float (bits_image_t *  image,
+			   int             x,
+			   int             y,
+			   int             width,
+			   const uint32_t *v)
+{
+    float *bits = (float *)image->bits + image->rowstride * y + 3 * x;
+    const argb_t *values = (argb_t *)v;
+
+    for (; width; width--, values++)
+    {
+	*bits++ = values->r;
+	*bits++ = values->g;
+	*bits++ = values->b;
+    }
+}
+#endif
+
 static void
 store_scanline_a2r10g10b10_float (bits_image_t *  image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  const uint32_t *v)
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     uint32_t *pixel = bits + x;
     argb_t *values = (argb_t *)v;
     int i;
 
     for (i = 0; i < width; ++i)
     {
-	uint16_t a, r, g, b;
+	uint32_t a, r, g, b;
 
 	a = pixman_float_to_unorm (values[i].a, 2);
 	r = pixman_float_to_unorm (values[i].r, 10);
 	g = pixman_float_to_unorm (values[i].g, 10);
 	b = pixman_float_to_unorm (values[i].b, 10);
 
 	WRITE (image, pixel++,
 	       (a << 30) | (r << 20) | (g << 10) | b);
@@ -997,17 +1112,17 @@ store_scanline_x2r10g10b10_float (bits_i
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     uint32_t *pixel = bits + x;
     argb_t *values = (argb_t *)v;
     int i;
 
     for (i = 0; i < width; ++i)
     {
-	uint16_t r, g, b;
+	uint32_t r, g, b;
 
 	r = pixman_float_to_unorm (values[i].r, 10);
 	g = pixman_float_to_unorm (values[i].g, 10);
 	b = pixman_float_to_unorm (values[i].b, 10);
 
 	WRITE (image, pixel++,
 	       (r << 20) | (g << 10) | b);
     }
@@ -1022,17 +1137,17 @@ store_scanline_a2b10g10r10_float (bits_i
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     uint32_t *pixel = bits + x;
     argb_t *values = (argb_t *)v;
     int i;
 
     for (i = 0; i < width; ++i)
     {
-	uint16_t a, r, g, b;
+	uint32_t a, r, g, b;
 
 	a = pixman_float_to_unorm (values[i].a, 2);
 	r = pixman_float_to_unorm (values[i].r, 10);
 	g = pixman_float_to_unorm (values[i].g, 10);
 	b = pixman_float_to_unorm (values[i].b, 10);
 
 	WRITE (image, pixel++,
 	       (a << 30) | (b << 20) | (g << 10) | r);
@@ -1048,17 +1163,17 @@ store_scanline_x2b10g10r10_float (bits_i
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     uint32_t *pixel = bits + x;
     argb_t *values = (argb_t *)v;
     int i;
 
     for (i = 0; i < width; ++i)
     {
-	uint16_t r, g, b;
+	uint32_t r, g, b;
 
 	r = pixman_float_to_unorm (values[i].r, 10);
 	g = pixman_float_to_unorm (values[i].g, 10);
 	b = pixman_float_to_unorm (values[i].b, 10);
 
 	WRITE (image, pixel++,
 	       (b << 20) | (g << 10) | r);
     }
@@ -1073,66 +1188,28 @@ store_scanline_a8r8g8b8_sRGB_float (bits
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     uint32_t *pixel = bits + x;
     argb_t *values = (argb_t *)v;
     int i;
 
     for (i = 0; i < width; ++i)
     {
-	uint8_t a, r, g, b;
+	uint32_t a, r, g, b;
 
 	a = pixman_float_to_unorm (values[i].a, 8);
 	r = to_srgb (values[i].r);
 	g = to_srgb (values[i].g);
 	b = to_srgb (values[i].b);
 
 	WRITE (image, pixel++,
 	       (a << 24) | (r << 16) | (g << 8) | b);
     }
 }
 
-static void
-store_scanline_16 (bits_image_t *  image,
-		   int             x,
-		   int             y,
-		   int             width,
-		   const uint32_t *v)
-{
-    uint16_t *bits = (uint16_t*)(image->bits + image->rowstride * y);
-    uint16_t *values = (uint16_t *)v;
-    uint16_t *pixel = bits + x;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++, values[i]);
-    }
-}
-
-static void
-fetch_scanline_16 (pixman_image_t *image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            uint32_t *      b,
-                            const uint32_t *mask)
-{
-    const uint16_t *bits = (uint16_t*)(image->bits.bits + y * image->bits.rowstride);
-    const uint16_t *pixel = bits + x;
-    int i;
-    uint16_t *buffer = (uint16_t *)b;
-
-    for (i = 0; i < width; ++i)
-    {
-	*buffer++ = READ (image, pixel++);
-    }
-}
-
-
 /*
  * Contracts a floating point image to 32bpp and then stores it using a
  * regular 32-bit store proc. Despite the type, this function expects an
  * argb_t buffer.
  */
 static void
 store_scanline_generic_float (bits_image_t *  image,
 			      int             x,
@@ -1154,47 +1231,47 @@ store_scanline_generic_float (bits_image
     pixman_contract_from_float (argb8_pixels, (argb_t *)values, width);
 
     image->store_scanline_32 (image, x, y, width, argb8_pixels);
 
     free (argb8_pixels);
 }
 
 static void
-fetch_scanline_generic_float (pixman_image_t *image,
+fetch_scanline_generic_float (bits_image_t *  image,
 			      int	      x,
 			      int	      y,
 			      int	      width,
 			      uint32_t *      buffer,
 			      const uint32_t *mask)
 {
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
+    image->fetch_scanline_32 (image, x, y, width, buffer, NULL);
 
-    pixman_expand_to_float ((argb_t *)buffer, buffer, image->bits.format, width);
+    pixman_expand_to_float ((argb_t *)buffer, buffer, image->format, width);
 }
 
 /* The 32_sRGB paths should be deleted after narrow processing
  * is no longer invoked for formats that are considered wide.
  * (Also see fetch_pixel_generic_lossy_32) */
 static void
-fetch_scanline_a8r8g8b8_32_sRGB (pixman_image_t *image,
+fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image,
                                  int             x,
                                  int             y,
                                  int             width,
                                  uint32_t       *buffer,
                                  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     uint32_t tmp;
     
     while (pixel < end)
     {
-	uint8_t a, r, g, b;
+	uint32_t a, r, g, b;
 
 	tmp = READ (image, pixel++);
 
 	a = (tmp >> 24) & 0xff;
 	r = (tmp >> 16) & 0xff;
 	g = (tmp >> 8) & 0xff;
 	b = (tmp >> 0) & 0xff;
 
@@ -1208,17 +1285,17 @@ fetch_scanline_a8r8g8b8_32_sRGB (pixman_
 
 static uint32_t
 fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
 			      int           offset,
 			      int           line)
 {
     uint32_t *bits = image->bits + line * image->rowstride;
     uint32_t tmp = READ (image, bits + offset);
-    uint8_t a, r, g, b;
+    uint32_t a, r, g, b;
 
     a = (tmp >> 24) & 0xff;
     r = (tmp >> 16) & 0xff;
     g = (tmp >> 8) & 0xff;
     b = (tmp >> 0) & 0xff;
 
     r = to_linear[r] * 255.0f + 0.5f;
     g = to_linear[g] * 255.0f + 0.5f;
@@ -1237,17 +1314,17 @@ store_scanline_a8r8g8b8_32_sRGB (bits_im
     uint32_t *bits = image->bits + image->rowstride * y;
     uint64_t *values = (uint64_t *)v;
     uint32_t *pixel = bits + x;
     uint64_t tmp;
     int i;
     
     for (i = 0; i < width; ++i)
     {
-	uint8_t a, r, g, b;
+	uint32_t a, r, g, b;
 
 	tmp = values[i];
 
 	a = (tmp >> 24) & 0xff;
 	r = (tmp >> 16) & 0xff;
 	g = (tmp >> 8) & 0xff;
 	b = (tmp >> 0) & 0xff;
 
@@ -1289,81 +1366,62 @@ fetch_pixel_generic_lossy_32 (bits_image
     pixman_contract_from_float (&result, &pixel64, 1);
 
     return result;
 }
 
 typedef struct
 {
     pixman_format_code_t	format;
-    fetch_scanline_t		fetch_scanline_16;
     fetch_scanline_t		fetch_scanline_32;
     fetch_scanline_t		fetch_scanline_float;
     fetch_pixel_32_t		fetch_pixel_32;
     fetch_pixel_float_t		fetch_pixel_float;
-    store_scanline_t		store_scanline_16;
     store_scanline_t		store_scanline_32;
     store_scanline_t		store_scanline_float;
 } format_info_t;
 
 #define FORMAT_INFO(format) 						\
     {									\
 	PIXMAN_ ## format,						\
-	    NULL,							\
 	    fetch_scanline_ ## format,					\
 	    fetch_scanline_generic_float,				\
 	    fetch_pixel_ ## format,					\
 	    fetch_pixel_generic_float,					\
-	    NULL,							\
 	    store_scanline_ ## format,					\
 	    store_scanline_generic_float				\
     }
-#define FORMAT_INFO16(format) 						\
-    {									\
-	PIXMAN_ ## format,						\
-	    fetch_scanline_16,						\
-	    fetch_scanline_ ## format,					\
-	    fetch_scanline_generic_float,				\
-	    fetch_pixel_ ## format,					\
-	    fetch_pixel_generic_float,					\
-	    store_scanline_16,						\
-	    store_scanline_ ## format,					\
-	    store_scanline_generic_float				\
-    }
-
 
 static const format_info_t accessors[] =
 {
 /* 32 bpp formats */
     FORMAT_INFO (a8r8g8b8),
     FORMAT_INFO (x8r8g8b8),
     FORMAT_INFO (a8b8g8r8),
     FORMAT_INFO (x8b8g8r8),
     FORMAT_INFO (b8g8r8a8),
     FORMAT_INFO (b8g8r8x8),
     FORMAT_INFO (r8g8b8a8),
     FORMAT_INFO (r8g8b8x8),
     FORMAT_INFO (x14r6g6b6),
 
 /* sRGB formats */
   { PIXMAN_a8r8g8b8_sRGB,
-    NULL,
     fetch_scanline_a8r8g8b8_32_sRGB, fetch_scanline_a8r8g8b8_sRGB_float,
     fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float,
-    NULL,
     store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float,
   },
 
 /* 24bpp formats */
     FORMAT_INFO (r8g8b8),
     FORMAT_INFO (b8g8r8),
     
 /* 16bpp formats */
-    FORMAT_INFO16 (r5g6b5),
-    FORMAT_INFO16 (b5g6r5),
+    FORMAT_INFO (r5g6b5),
+    FORMAT_INFO (b5g6r5),
     
     FORMAT_INFO (a1r5g5b5),
     FORMAT_INFO (x1r5g5b5),
     FORMAT_INFO (a1b5g5r5),
     FORMAT_INFO (x1b5g5r5),
     FORMAT_INFO (a4r4g4b4),
     FORMAT_INFO (x4r4g4b4),
     FORMAT_INFO (a4b4g4r4),
@@ -1403,66 +1461,75 @@ static const format_info_t accessors[] =
     
     FORMAT_INFO (g4),
     
 /* 1bpp formats */
     FORMAT_INFO (a1),
     FORMAT_INFO (g1),
     
 /* Wide formats */
-    
+#ifndef PIXMAN_FB_ACCESSORS
+    { PIXMAN_rgba_float,
+      NULL, fetch_scanline_rgbaf_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_rgbaf_float,
+      NULL, store_scanline_rgbaf_float },
+
+    { PIXMAN_rgb_float,
+      NULL, fetch_scanline_rgbf_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_rgbf_float,
+      NULL, store_scanline_rgbf_float },
+#endif
+
     { PIXMAN_a2r10g10b10,
-      NULL, NULL, fetch_scanline_a2r10g10b10_float,
+      NULL, fetch_scanline_a2r10g10b10_float,
       fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float,
-      NULL, NULL, store_scanline_a2r10g10b10_float },
+      NULL, store_scanline_a2r10g10b10_float },
 
     { PIXMAN_x2r10g10b10,
-      NULL, NULL, fetch_scanline_x2r10g10b10_float,
+      NULL, fetch_scanline_x2r10g10b10_float,
       fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10_float,
-      NULL, NULL, store_scanline_x2r10g10b10_float },
+      NULL, store_scanline_x2r10g10b10_float },
 
     { PIXMAN_a2b10g10r10,
-      NULL, NULL, fetch_scanline_a2b10g10r10_float,
+      NULL, fetch_scanline_a2b10g10r10_float,
       fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10_float,
-      NULL, NULL, store_scanline_a2b10g10r10_float },
+      NULL, store_scanline_a2b10g10r10_float },
 
     { PIXMAN_x2b10g10r10,
-      NULL, NULL, fetch_scanline_x2b10g10r10_float,
+      NULL, fetch_scanline_x2b10g10r10_float,
       fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10_float,
-      NULL, NULL, store_scanline_x2b10g10r10_float },
+      NULL, store_scanline_x2b10g10r10_float },
 
 /* YUV formats */
     { PIXMAN_yuy2,
-      NULL, fetch_scanline_yuy2, fetch_scanline_generic_float,
+      fetch_scanline_yuy2, fetch_scanline_generic_float,
       fetch_pixel_yuy2, fetch_pixel_generic_float,
-      NULL, NULL, NULL },
+      NULL, NULL },
 
     { PIXMAN_yv12,
-      NULL, fetch_scanline_yv12, fetch_scanline_generic_float,
+      fetch_scanline_yv12, fetch_scanline_generic_float,
       fetch_pixel_yv12, fetch_pixel_generic_float,
-      NULL, NULL, NULL },
+      NULL, NULL },
     
     { PIXMAN_null },
 };
 
 static void
 setup_accessors (bits_image_t *image)
 {
     const format_info_t *info = accessors;
     
     while (info->format != PIXMAN_null)
     {
 	if (info->format == image->format)
 	{
-	    image->fetch_scanline_16 = info->fetch_scanline_16;
 	    image->fetch_scanline_32 = info->fetch_scanline_32;
 	    image->fetch_scanline_float = info->fetch_scanline_float;
 	    image->fetch_pixel_32 = info->fetch_pixel_32;
 	    image->fetch_pixel_float = info->fetch_pixel_float;
-	    image->store_scanline_16 = info->store_scanline_16;
 	    image->store_scanline_32 = info->store_scanline_32;
 	    image->store_scanline_float = info->store_scanline_float;
 	    
 	    return;
 	}
 	
 	info++;
     }
new file mode 100644
--- /dev/null
+++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
--- a/gfx/cairo/libpixman/src/pixman-arm-common.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-common.h
@@ -261,23 +261,16 @@ FAST_NEAREST_MAINLOOP (cputype##_##name#
                        src_type, dst_type, NONE)                              \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
                        src_type, dst_type, PAD)                               \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
                        src_type, dst_type, NORMAL)
 
-/* Provide entries for the fast path table */
-#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
-    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
-    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
-    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
 #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
                                                   src_type, dst_type)         \
 void                                                                          \
 pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    int32_t          w,        \
                                                    dst_type *       dst,      \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
@@ -313,19 +306,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
                               src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
                               src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH (op,s,d,func),                           \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
 
 /*****************************************************************************/
 
 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
                                                 src_type, dst_type)           \
 void                                                                          \
 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
@@ -355,26 +346,26 @@ scaled_bilinear_scanline_##cputype##_##n
     if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
 	return;                                                               \
     pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
                             dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
 }                                                                             \
                                                                               \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE)  \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE)   \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE)    \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint32_t, dst_type, NORMAL,            \
+                       src_type, uint32_t, dst_type, NORMAL,                  \
                        FLAG_NONE)
 
 
 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
                                                 src_type, dst_type)           \
 void                                                                          \
 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
                                                 dst_type *       dst,         \
@@ -404,25 +395,25 @@ scaled_bilinear_scanline_##cputype##_##n
     if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
 	return;                                                                   \
     pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
                       dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
 }                                                                             \
                                                                               \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint8_t, dst_type, COVER,              \
+                       src_type, uint8_t, dst_type, COVER,                    \
                        FLAG_HAVE_NON_SOLID_MASK)                              \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint8_t, dst_type, NONE,               \
+                       src_type, uint8_t, dst_type, NONE,                     \
                        FLAG_HAVE_NON_SOLID_MASK)                              \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint8_t, dst_type, PAD,                \
+                       src_type, uint8_t, dst_type, PAD,                      \
                        FLAG_HAVE_NON_SOLID_MASK)                              \
 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       NULL, src_type, uint8_t, dst_type, NORMAL,             \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
                        FLAG_HAVE_NON_SOLID_MASK)
 
 
 #endif
deleted file mode 100644
--- a/gfx/cairo/libpixman/src/pixman-arm-detect-win32.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-    area pixman_msvc, code, readonly
-
-    export  pixman_msvc_try_arm_simd_op
-
-pixman_msvc_try_arm_simd_op
-    ;; I don't think the msvc arm asm knows how to do SIMD insns
-    ;; uqadd8 r3,r3,r3
-    dcd 0xe6633f93
-    mov pc,lr
-    endp
-
-    export  pixman_msvc_try_arm_neon_op
-
-pixman_msvc_try_arm_neon_op
-    ;; I don't think the msvc arm asm knows how to do NEON insns
-    ;; veor d0,d0,d0
-    dcd 0xf3000110
-    mov pc,lr
-    endp
-
-    end
--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
@@ -60,33 +60,23 @@
 .object_arch armv4
 .eabi_attribute 10, 0
 .eabi_attribute 12, 0
 .arm
 .altmacro
 .p2align 2
 
 #include "pixman-private.h"
+#include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
 /*
  * Bilinear macros from pixman-arm-neon-asm.S
  */
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-    .func fname
-    .global fname
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-.endm
-
 /*
  * Bilinear scaling support code which tries to provide pixel fetching, color
  * format conversion, and interpolation as separate macros which can be used
  * as the basic building blocks for constructing bilinear scanline functions.
  */
 
 .macro bilinear_load_8888 reg1, reg2, tmp
     mov       TMP1, X, asr #16
--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
@@ -45,16 +45,17 @@
     .object_arch armv4
     .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
     .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
     .arm
     .altmacro
     .p2align 2
 
 #include "pixman-private.h"
+#include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
 /* Global configuration options and preferences */
 
 /*
  * The code can optionally make use of unaligned memory accesses to improve
  * performance of handling leading/trailing pixels for each scanline.
  * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
@@ -949,17 +950,16 @@ generate_composite_function \
  * registers. Additionally, this function needs all the NEON registers,
  * so it has to save d8-d15 registers which are callee saved according
  * to ABI. These registers are restored from 'cleanup' macro. All the
  * other NEON registers are caller saved, so can be clobbered freely
  * without introducing any problems.
  */
 .macro pixman_composite_over_n_8_0565_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d11[0]}, [DUMMY]
     vdup.8      d8, d11[0]
     vdup.8      d9, d11[1]
     vdup.8      d10, d11[2]
     vdup.8      d11, d11[3]
 .endm
 
@@ -977,17 +977,16 @@ generate_composite_function \
     pixman_composite_over_8888_8_0565_process_pixblock_head, \
     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
     pixman_composite_over_8888_8_0565_process_pixblock_tail_head
 
 /******************************************************************************/
 
 .macro pixman_composite_over_8888_n_0565_init
     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d24[0]}, [DUMMY]
     vdup.8      d24, d24[3]
 .endm
 
 .macro pixman_composite_over_8888_n_0565_cleanup
     vpop        {d8-d15}
 .endm
@@ -1444,17 +1443,16 @@ generate_composite_function \
     vmull.u8    q8, d25, d4
     vmull.u8    q9, d25, d5
     vmull.u8    q10, d25, d6
     vmull.u8    q11, d25, d7
 .endm
 
 .macro pixman_composite_over_n_8_8888_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d11[0]}, [DUMMY]
     vdup.8      d8, d11[0]
     vdup.8      d9, d11[1]
     vdup.8      d10, d11[2]
     vdup.8      d11, d11[3]
 .endm
 
@@ -1516,17 +1514,16 @@ generate_composite_function \
     fetch_mask_pixblock
     cache_preload 32, 32
     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
     pixman_composite_over_n_8_8_process_pixblock_head
 .endm
 
 .macro pixman_composite_over_n_8_8_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d8[0]}, [DUMMY]
     vdup.8      d8, d8[3]
 .endm
 
 .macro pixman_composite_over_n_8_8_cleanup
     vpop        {d8-d15}
 .endm
@@ -1618,17 +1615,16 @@ generate_composite_function \
         vqadd.u8    q15, q1, q15
     cache_preload 8, 8
     pixman_composite_over_n_8888_8888_ca_process_pixblock_head
     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
 .endm
 
 .macro pixman_composite_over_n_8888_8888_ca_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d11[0]}, [DUMMY]
     vdup.8      d8, d11[0]
     vdup.8      d9, d11[1]
     vdup.8      d10, d11[2]
     vdup.8      d11, d11[3]
 .endm
 
@@ -1788,17 +1784,16 @@ generate_composite_function \
             vmull.u8    q7,  d17, d25
             vmull.u8    q6,  d16, d24
             vmull.u8    q11, d18, d26
     vst1.16     {d28, d29}, [DST_W, :128]!
 .endm
 
 .macro pixman_composite_over_n_8888_0565_ca_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d11[0]}, [DUMMY]
     vdup.8      d8, d11[0]
     vdup.8      d9, d11[1]
     vdup.8      d10, d11[2]
     vdup.8      d11, d11[3]
 .endm
 
@@ -1902,17 +1897,16 @@ generate_composite_function \
     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
     fetch_mask_pixblock
     cache_preload 32, 32
     pixman_composite_add_n_8_8_process_pixblock_head
 .endm
 
 .macro pixman_composite_add_n_8_8_init
     add         DUMMY, sp, #ARGS_STACK_OFFSET
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d11[0]}, [DUMMY]
     vdup.8      d11, d11[3]
 .endm
 
 .macro pixman_composite_add_n_8_8_cleanup
     vpop        {d8-d15}
 .endm
@@ -2209,17 +2203,16 @@ generate_composite_function_single_scanl
     fetch_src_pixblock
     cache_preload 8, 8
     pixman_composite_over_8888_n_8888_process_pixblock_head
     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
 .endm
 
 .macro pixman_composite_over_8888_n_8888_init
     add         DUMMY, sp, #48
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d15[0]}, [DUMMY]
     vdup.8      d15, d15[3]
 .endm
 
 .macro pixman_composite_over_8888_n_8888_cleanup
     vpop        {d8-d15}
 .endm
@@ -2582,17 +2575,16 @@ generate_composite_function \
     10,  /* dst_r_basereg */ \
     8,  /* src_basereg   */ \
     15  /* mask_basereg  */
 
 /******************************************************************************/
 
 .macro pixman_composite_over_0565_n_0565_init
     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
-    .vsave      {d8-d15}
     vpush       {d8-d15}
     vld1.32     {d15[0]}, [DUMMY]
     vdup.8      d15, d15[3]
 .endm
 
 .macro pixman_composite_over_0565_n_0565_cleanup
     vpop        {d8-d15}
 .endm
@@ -2834,27 +2826,16 @@ generate_composite_function_nearest_scan
     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
     28, /* dst_w_basereg */ \
     10,  /* dst_r_basereg */ \
     8,  /* src_basereg   */ \
     15  /* mask_basereg  */
 
 /******************************************************************************/
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-    .func fname
-    .global fname
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-.endm
-
 /*
  * Bilinear scaling support code which tries to provide pixel fetching, color
  * format conversion, and interpolation as separate macros which can be used
  * as the basic building blocks for constructing bilinear scanline functions.
  */
 
 .macro bilinear_load_8888 reg1, reg2, tmp
     mov       TMP1, X, asr #16
@@ -3136,26 +3117,23 @@ pixman_asm_function fname
     WIDTH     .req      ip
     TMP1      .req      r3
     TMP2      .req      r4
     PF_OFFS   .req      r7
     TMP3      .req      r8
     TMP4      .req      r9
     STRIDE    .req      r2
 
-    .fnstart
     mov       ip, sp
-    .save     {r4, r5, r6, r7, r8, r9}
     push      {r4, r5, r6, r7, r8, r9}
     mov       PF_OFFS, #prefetch_distance
     ldmia     ip, {WB, X, UX, WIDTH}
     mul       PF_OFFS, PF_OFFS, UX
 
 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
-    .vsave    {d8-d15}
     vpush     {d8-d15}
 .endif
 
     sub       STRIDE, BOTTOM, TOP
     .unreq    BOTTOM
 
     cmp       WIDTH, #0
     ble       3f
@@ -3239,17 +3217,16 @@ 2:
     beq       3f
     bilinear_interpolate_last_pixel src_fmt, dst_fmt
 3:
 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
     vpop      {d8-d15}
 .endif
     pop       {r4, r5, r6, r7, r8, r9}
     bx        lr
-    .fnend
 
     .unreq    OUT
     .unreq    TOP
     .unreq    WT
     .unreq    WB
     .unreq    X
     .unreq    UX
     .unreq    WIDTH
--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
@@ -380,17 +380,17 @@ 5:  subpls  VX, VX, SRC_WIDTH_FIXED
  * miss in this case, and PLD would be useless.
  *
  * This sounds like it may introduce a noticeable overhead (when working with
  * fully cached data). But in reality, due to having a separate pipeline and
  * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
  * execute simultaneously with NEON and be completely shadowed by it. Thus
  * we get no performance overhead at all (*). This looks like a very nice
  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
- * but still can implement some rather advanced prefetch logic in sofware
+ * but still can implement some rather advanced prefetch logic in software
  * for almost zero cost!
  *
  * (*) The overhead of the prefetcher is visible when running some trivial
  * pixels processing like simple copy. Anyway, having prefetch is a must
  * when working with the graphics data.
  */
 .macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
@@ -626,26 +626,18 @@ 2:
                                    process_pixblock_head, \
                                    process_pixblock_tail, \
                                    process_pixblock_tail_head, \
                                    dst_w_basereg_ = 28, \
                                    dst_r_basereg_ = 4, \
                                    src_basereg_   = 0, \
                                    mask_basereg_  = 24
 
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-    .fnstart
-    .save       {r4-r12, lr}
+    pixman_asm_function fname
+
     push        {r4-r12, lr}        /* save all registers */
 
 /*
  * Select prefetch type for this function. If prefetch distance is
  * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
  * has to be used instead of ADVANCED.
  */
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
@@ -813,17 +805,16 @@ fname:
     PF mov      PF_DST, DST_R
     PF mov      PF_MASK, MASK
     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
     PF mov      PF_CTL, H, lsl #4
     PF add      PF_CTL, #(prefetch_distance - 0x10)
 
     init
 .if regs_shortage
-    .save       {r0, r1}
     push        {r0, r1}
 .endif
     subs        H, H, #1
 .if regs_shortage
     str         H, [sp, #4] /* save updated height to stack */
 .else
     mov         ORIG_W, W
 .endif
@@ -899,17 +890,16 @@ 1:
                             process_pixblock_tail_head
     advance_to_next_scanline 8b
 9:
 .if regs_shortage
     pop         {r0, r1}
 .endif
     cleanup
     pop         {r4-r12, pc}  /* exit */
-    .fnend
 
     .purgem     fetch_src_pixblock
     .purgem     pixld_src
 
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
     .unreq      DST_W
@@ -944,25 +934,18 @@ 9:
                                                    process_pixblock_head, \
                                                    process_pixblock_tail, \
                                                    process_pixblock_tail_head, \
                                                    dst_w_basereg_ = 28, \
                                                    dst_r_basereg_ = 4, \
                                                    src_basereg_   = 0, \
                                                    mask_basereg_  = 24
 
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-    .fnstart
+    pixman_asm_function fname
+
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 /*
  * Make some macro arguments globally visible and accessible
  * from other macros
  */
     .set src_bpp, src_bpp_
     .set mask_bpp, mask_bpp_
     .set dst_w_bpp, dst_w_bpp_
@@ -987,17 +970,16 @@ fname:
     DST_R       .req        r6
     SRC_WIDTH_FIXED .req        r7
 
     .macro pixld_src x:vararg
         pixld_s x
     .endm
 
     ldr         UNIT_X, [sp]
-    .save       {r4-r8, lr}
     push        {r4-r8, lr}
     ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
     .if mask_bpp != 0
     ldr         MASK, [sp, #(24 + 8)]
     .endif
 .else
     /*
      * Assign symbolic names to registers
@@ -1103,17 +1085,16 @@ 8:
     .unreq      DST_R
     .unreq      DST_W
     .unreq      W
 .endif
 
     .purgem     fetch_src_pixblock
     .purgem     pixld_src
 
-    .fnend
     .endfunc
 .endm
 
 .macro generate_composite_function_single_scanline x:vararg
     generate_composite_function_scanline 0, x
 .endm
 
 .macro generate_composite_function_nearest_scanline x:vararg
@@ -1130,17 +1111,16 @@ 8:
 
 /*
  * Prologue/epilogue variant which additionally saves/restores d8-d15
  * registers (they need to be saved/restored by callee according to ABI).
  * This is required if the code needs to use all the NEON registers.
  */
 
 .macro default_init_need_all_regs
-    .vsave      {d8-d15}
     vpush       {d8-d15}
 .endm
 
 .macro default_cleanup_need_all_regs
     vpop        {d8-d15}
 .endm
 
 /******************************************************************************/
--- a/gfx/cairo/libpixman/src/pixman-arm-neon.c
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c
@@ -140,33 +140,16 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST 
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
                                          uint32_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
                                          uint16_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
                                          uint16_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
                                          uint32_t, uint32_t)
-static force_inline void
-pixman_scaled_bilinear_scanline_8888_8888_SRC (
-                                                uint32_t *       dst,
-                                                const uint32_t * mask,
-                                                const uint32_t * src_top,
-                                                const uint32_t * src_bottom,
-                                                int32_t          w,
-                                                int              wt,
-                                                int              wb,
-                                                pixman_fixed_t   vx,
-                                                pixman_fixed_t   unit_x,
-                                                pixman_fixed_t   max_vx,
-                                                pixman_bool_t    zero_src)
-{
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w);
-}
-
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
                                          uint32_t, uint32_t)
 
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
                                             uint32_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
                                             uint32_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
@@ -280,38 +263,16 @@ arm_neon_blt (pixman_implementation_t *i
 		(uint32_t *)(((char *) src_bits) +
 		src_y * src_stride * 4 + src_x * 4), src_stride);
 	return TRUE;
     default:
 	return FALSE;
     }
 }
 
-static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
-{
-    pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER,
-			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
-			       uint32_t, uint32_t, uint16_t,
-			       COVER, FLAG_NONE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER,
-			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
-			       uint32_t, uint32_t, uint16_t,
-			       PAD, FLAG_NONE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER,
-			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
-			       uint32_t, uint32_t, uint16_t,
-			       NONE, FLAG_NONE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER,
-			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
-			       uint32_t, uint32_t, uint16_t,
-			       NORMAL, FLAG_NONE)
-
 static const pixman_fast_path_t arm_neon_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
     PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
     PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
     PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
     PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
     PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
@@ -396,31 +357,31 @@ static const pixman_fast_path_t arm_neon
     PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
     /* Note: NONE repeat is not supported yet */
     SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
     SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
 
     PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
@@ -455,18 +416,16 @@ static const pixman_fast_path_t arm_neon
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
 
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
 
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
 
-    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
-
     { PIXMAN_OP_NONE },
 };
 
 #define BIND_COMBINE_U(name)                                             \
 void                                                                     \
 pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
                                                   const uint32_t *dst,   \
                                                   const uint32_t *src,   \
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
@@ -32,26 +32,17 @@
 
 	.text
 	.arch armv6
 	.object_arch armv4
 	.arm
 	.altmacro
 	.p2align 2
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-	.func fname
-	.global fname
-#ifdef __ELF__
-	.hidden fname
-	.type fname, %function
-#endif
-fname:
-.endm
+#include "pixman-arm-asm.h"
 
 /*
  * Note: This code is only using armv5te instructions (not even armv6),
  *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
  *       be split into a few variants, tuned for each microarchitecture.
  *
  * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
  * have efficient write combining), it needs to be changed to use 16-byte
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
@@ -32,16 +32,17 @@
 
 	.text
 	.arch armv6
 	.object_arch armv4
 	.arm
 	.altmacro
 	.p2align 2
 
+#include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 
 /* A head macro should do all processing which results in an output of up to
  * 16 bytes, as far as the final load instruction. The corresponding tail macro
  * should complete the processing of the up-to-16 bytes. The calling macro will
  * sometimes choose to insert a preload or a decrement of X between them.
  *   cond           ARM condition code for code block
  *   numbytes       Number of output bytes that should be generated this time
@@ -298,16 +299,93 @@ generate_composite_function \
     src_0565_8888_init, \
     nop_macro, /* newline */ \
     nop_macro, /* cleanup */ \
     src_0565_8888_process_head, \
     src_0565_8888_process_tail
 
 /******************************************************************************/
 
+.macro src_x888_0565_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x001F001F
+        line_saved_regs  STRIDE_S, ORIG_W
+.endm
+
+.macro src_x888_0565_1pixel  s, d
+        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
+        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
+        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
+        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
+        /* Top 16 bits are discarded during the following STRH */
+.endm
+
+.macro src_x888_0565_2pixels  slo, shi, d, tmp
+        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
+        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
+        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
+        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
+        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
+        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
+        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
+        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
+        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+.endm
+
+.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        WK4     .req    STRIDE_S
+        WK5     .req    STRIDE_M
+        WK6     .req    WK3
+        WK7     .req    ORIG_W
+ .if numbytes == 16
+        pixld   , 16, 4, SRC, 0
+        src_x888_0565_2pixels  4, 5, 0, 0
+        pixld   , 8, 4, SRC, 0
+        src_x888_0565_2pixels  6, 7, 1, 1
+        pixld   , 8, 6, SRC, 0
+ .else
+        pixld   , numbytes*2, 4, SRC, 0
+ .endif
+.endm
+
+.macro src_x888_0565_process_tail   cond, numbytes, firstreg
+ .if numbytes == 16
+        src_x888_0565_2pixels  4, 5, 2, 2
+        src_x888_0565_2pixels  6, 7, 3, 4
+ .elseif numbytes == 8
+        src_x888_0565_2pixels  4, 5, 1, 1
+        src_x888_0565_2pixels  6, 7, 2, 2
+ .elseif numbytes == 4
+        src_x888_0565_2pixels  4, 5, 1, 1
+ .else
+        src_x888_0565_1pixel  4, 1
+ .endif
+ .if numbytes == 16
+        pixst   , numbytes, 0, DST
+ .else
+        pixst   , numbytes, 1, DST
+ .endif
+        .unreq  WK4
+        .unreq  WK5
+        .unreq  WK6
+        .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    src_x888_0565_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_x888_0565_process_head, \
+    src_x888_0565_process_tail
+
+/******************************************************************************/
+
 .macro add_8_8_8pixels  cond, dst1, dst2
         uqadd8&cond  WK&dst1, WK&dst1, MASK
         uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
 .endm
 
 .macro add_8_8_4pixels  cond, dst
         uqadd8&cond  WK&dst, WK&dst, MASK
 .endm
@@ -606,8 +684,496 @@ generate_composite_function \
     over_n_8_8888_init, \
     over_n_8_8888_newline, \
     nop_macro, /* cleanup */ \
     over_n_8_8888_process_head, \
     over_n_8_8888_process_tail
 
 /******************************************************************************/
 
+/* over_reverse with a solid source: each dest pixel becomes
+ * dest + src * (255 - dest_alpha) (saturating), so the source only
+ * shows through where the destination is not fully opaque.
+ * init: fetch the solid source from the stack, pre-split it into
+ * red/blue and alpha/green halves, and set up the 0x00800080
+ * rounding constant. */
+.macro over_reverse_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     MASK, =0x00800080
+        /* Split source pixel into RB/AG parts */
+        uxtb16  STRIDE_S, SRC
+        uxtb16  STRIDE_M, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs  STRIDE_D, ORIG_W
+.endm
+
+/* Per-line setup: STRIDE_D holds the constant 0xFF used to form
+ * (255 - dest_alpha) in the per-pixel code. */
+.macro over_reverse_n_8888_newline
+        mov     STRIDE_D, #0xFF
+.endm
+
+/* Source is solid, so only destination pixels need loading. */
+.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+
+/* Composite one destination pixel in WK&d.  Fast paths: an all-zero
+ * dest is replaced by the source outright; a fully-opaque dest
+ * (bics result zero) is left unchanged.  Label 49f (skip store) is
+ * defined in over_reverse_n_8888_tail. */
+.macro over_reverse_n_8888_1pixel  d, is_only
+        teq     WK&d, #0
+        beq     8f       /* replace with source */
+        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
+ .if is_only == 1
+        beq     49f      /* skip store */
+ .else
+        beq     9f       /* write same value back */
+ .endif
+        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
+        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     ORIG_W, SCRATCH, ORIG_W
+        uqadd8  WK&d, WK&d, ORIG_W
+        b       9f
+8:      mov     WK&d, SRC
+9:
+.endm
+
+/* Composite and store numbytes worth of dest pixels.  For multi-pixel
+ * batches, first AND the alphas together: if every pixel is fully
+ * opaque the whole store is skipped (branch to 49f). */
+.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        over_reverse_n_8888_1pixel  reg1, 1
+ .else
+        and     SCRATCH, WK&reg1, WK&reg2
+  .if numbytes == 16
+        and     SCRATCH, SCRATCH, WK&reg3
+        and     SCRATCH, SCRATCH, WK&reg4
+  .endif
+        mvns    SCRATCH, SCRATCH, asr #24
+        beq     49f /* skip store if all opaque */
+        over_reverse_n_8888_1pixel  reg1, 0
+        over_reverse_n_8888_1pixel  reg2, 0
+  .if numbytes == 16
+        over_reverse_n_8888_1pixel  reg3, 0
+        over_reverse_n_8888_1pixel  reg4, 0
+  .endif
+ .endif
+        pixst   , numbytes, reg1, DST
+49:
+.endm
+
+/* Expand firstreg into the four consecutive register numbers the
+ * batch tail expects. */
+.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
+        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+/* over_reverse_n_8888: solid source (src_bpp = 0), no mask, 32bpp dest.
+ * BUG FIX: a comma was missing after the "32" argument before the line
+ * continuation; GAS would otherwise split the flags expression into
+ * separate whitespace-separated macro arguments (compare the correct
+ * invocation for src_x888_0565 above). */
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    over_reverse_n_8888_init, \
+    over_reverse_n_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_reverse_n_8888_process_head, \
+    over_reverse_n_8888_process_tail
+
+/******************************************************************************/
+
+/* over with a component-alpha mask and a solid white source: the
+ * per-channel combine reduces to dest * ~mask + mask (see _combine).
+ * init: alias temporaries, set up the 0x800080 rounding constant and
+ * GE flags, and bias the destination preload (loads happen in the
+ * tail macros; see DST_PRELOAD_BIAS in pixman-arm-simd-asm.h). */
+.macro over_white_8888_8888_ca_init
+        HALF    .req    SRC
+        TMP0    .req    STRIDE_D
+        TMP1    .req    STRIDE_S
+        TMP2    .req    STRIDE_M
+        TMP3    .req    ORIG_W
+        WK4     .req    SCRATCH
+        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+        ldr     SCRATCH, =0x800080
+        mov     HALF, #0x80
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        .set DST_PRELOAD_BIAS, 8
+.endm
+
+/* Undo init: restore DST_PRELOAD_BIAS and release the register aliases. */
+.macro over_white_8888_8888_ca_cleanup
+        .set DST_PRELOAD_BIAS, 0
+        .unreq  HALF
+        .unreq  TMP0
+        .unreq  TMP1
+        .unreq  TMP2
+        .unreq  TMP3
+        .unreq  WK4
+.endm
+
+/* d = d * ~m (per 8-bit channel, with 0x80 rounding) + m, saturating.
+ * TMP0 must hold ~m on entry (set by the callers with mvn).  Channel
+ * pairs are multiplied with SMLAxy, re-rounded via uxtab16/ror #8, and
+ * the RB/AG halves recombined with SEL. */
+.macro over_white_8888_8888_ca_combine  m, d
+        uxtb16  TMP1, TMP0                /* rb_notmask */
+        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
+        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
+        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
+        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
+        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
+        smlatt  d, TMP1, TMP0, HALF       /* alpha */
+        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
+        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
+        uxtab16 TMP0, TMP0, TMP0, ror #8
+        uxtab16 TMP1, TMP1, TMP1, ror #8
+        mov     TMP0, TMP0, ror #8
+        sel     d, TMP0, TMP1
+        uqadd8  d, d, m                   /* d is a late result */
+.endm
+
+/* Load one mask pixel (into WK1) and one dest pixel (into WK3). */
+.macro over_white_8888_8888_ca_1pixel_head
+        pixld   , 4, 1, MASK, 0
+        pixld   , 4, 3, DST, 0
+.endm
+
+/* One-pixel tail.  teq WK1, WK1, asr #32 sets Z only when the mask is
+ * all-0s or all-1s, with C distinguishing the two (C = mask bit 31).
+ * mask == -1: result is the (white) mask itself; mask == 0: dest is
+ * unchanged and the store is skipped; otherwise run the full combine.
+ * TMP0 = ~mask is precomputed for the combine path. */
+.macro over_white_8888_8888_ca_1pixel_tail
+        mvn     TMP0, WK1
+        teq     WK1, WK1, asr #32
+        bne     01f
+        bcc     03f
+        mov     WK3, WK1
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     pixst   , 4, 3, DST
+03:
+.endm
+
+/* Load two mask pixels (WK1-WK2); dest pixels are loaded in the tail. */
+.macro over_white_8888_8888_ca_2pixels_head
+        pixld   , 8, 1, MASK, 0
+.endm
+
+/* Two-pixel tail: classify each mask word as 0 / -1 / other exactly as
+ * in the 1-pixel case.  The store of both pixels is skipped only when
+ * the first mask is 0 AND the second mask is 0 (branch to 05f). */
+.macro over_white_8888_8888_ca_2pixels_tail
+        pixld   , 8, 3, DST
+        mvn     TMP0, WK1
+        teq     WK1, WK1, asr #32
+        bne     01f
+        movcs   WK3, WK1
+        bcs     02f
+        teq     WK2, #0
+        beq     05f
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     mvn     TMP0, WK2
+        teq     WK2, WK2, asr #32
+        bne     03f
+        movcs   WK4, WK2
+        b       04f
+03:     over_white_8888_8888_ca_combine WK2, WK4
+04:     pixst   , 8, 3, DST
+05:
+.endm
+
+/* 4 bytes: one pixel.  8/16 bytes: work in pairs; for 16 bytes the
+ * first pair is completed entirely within the head phase, leaving one
+ * pair outstanding for process_tail. */
+.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_head
+ .else
+  .if numbytes == 16
+        over_white_8888_8888_ca_2pixels_head
+        over_white_8888_8888_ca_2pixels_tail
+  .endif
+        over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+/* Finish the pixel(s) whose head was issued last by process_head. */
+.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_tail
+ .else
+        over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+/* over_white_8888_8888_ca: solid white source, 32bpp component-alpha
+ * mask, 32bpp dest.
+ * BUG FIX: commas were missing after the "32" argument and after the
+ * flags expression before their line continuations; GAS would
+ * otherwise mis-split the macro arguments on whitespace (compare the
+ * correct invocation for src_x888_0565 above). */
+generate_composite_function \
+    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    2, /* prefetch distance */ \
+    over_white_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_white_8888_8888_ca_cleanup, \
+    over_white_8888_8888_ca_process_head, \
+    over_white_8888_8888_ca_process_tail
+
+
+/* over with a component-alpha mask and an arbitrary solid source;
+ * this helper is only reached when the source is not opaque white
+ * (see the dispatch at pixman_composite_over_n_8888_8888_ca_asm_armv6
+ * below).  init pushes four constants so they can be reloaded in
+ * pairs with ldrd during the per-pixel code. */
+.macro over_n_8888_8888_ca_init
+        /* Set up constants. RB_SRC and AG_SRC are in registers;
+         * RB_FLDS, A_SRC, and the two HALF values need to go on the
+         * stack (and the full SRC value is already there) */
+        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
+        mov     WK0, #0x00FF0000
+        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
+        mov     WK1, #0x80             /* HALF default value */
+        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
+        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+        push    {WK0-WK3}
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
+        uxtb16  SRC, SCRATCH
+        uxtb16  STRIDE_S, SCRATCH, ror #8
+
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, WK3, WK3
+
+        .unreq  WK0
+        .unreq  WK1
+        .unreq  WK2
+        .unreq  WK3
+        WK0     .req    Y
+        WK1     .req    STRIDE_D
+        RB_SRC  .req    SRC
+        AG_SRC  .req    STRIDE_S
+        WK2     .req    STRIDE_M
+        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
+        A_SRC   .req    r8
+        HALF    .req    r9
+        WK3     .req    r10
+        WK4     .req    r11
+        WK5     .req    SCRATCH
+        WK6     .req    ORIG_W
+
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+/* Pop the four constants pushed by init and restore the default
+ * WK0-WK3 register aliases. */
+.macro over_n_8888_8888_ca_cleanup
+        add     sp, sp, #16
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+        .unreq  WK0
+        .unreq  WK1
+        .unreq  RB_SRC
+        .unreq  AG_SRC
+        .unreq  WK2
+        .unreq  RB_FLDS
+        .unreq  A_SRC
+        .unreq  HALF
+        .unreq  WK3
+        .unreq  WK4
+        .unreq  WK5
+        .unreq  WK6
+        WK0     .req    r8
+        WK1     .req    r9
+        WK2     .req    r10
+        WK3     .req    r11
+.endm
+
+/* Load one mask pixel (into WK6) and one dest pixel (into WK0). */
+.macro over_n_8888_8888_ca_1pixel_head
+        pixld   , 4, 6, MASK, 0
+        pixld   , 4, 0, DST, 0
+.endm
+
+/* One-pixel tail.  Fast paths: transparent mask -> skip the store
+ * (40f); opaque mask + opaque source -> plain copy; opaque mask ->
+ * ordinary over_8888_8888.  Otherwise, full component-alpha
+ * arithmetic: src*mask and dest*(1 - src_alpha*mask), per channel,
+ * interleaved for pipelining and recombined with SEL. */
+.macro over_n_8888_8888_ca_1pixel_tail
+        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
+        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
+        bne     20f
+        bcc     40f
+        /* Mask is fully opaque (all channels) */
+        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+        eors    A_SRC, A_SRC, #0xFF
+        bne     10f
+        /* Source is also opaque - same as src_8888_8888 */
+        mov     WK0, WK6
+        b       30f
+10:     /* Same as over_8888_8888 */
+        mul_8888_8 WK0, A_SRC, WK5, HALF
+        uqadd8  WK0, WK0, WK6
+        b       30f
+20:     /* No simplifications possible - do it the hard way */
+        uxtb16  WK2, WK6, ror #8         /* ag_mask */
+        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
+        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
+        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
+        uxtb16  WK5, WK0                 /* rb_dest */
+        uxtab16 WK3, WK3, WK3, ror #8
+        uxtb16  WK6, WK0, ror #8         /* ag_dest */
+        uxtab16 WK4, WK4, WK4, ror #8
+        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
+        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
+        bic     WK3, RB_FLDS, WK3, lsr #8
+        bic     WK4, RB_FLDS, WK4, lsr #8
+        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
+        smlatt  WK0, WK5, WK3, HALF      /* red2 */
+        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
+        uxtab16 WK1, WK1, WK1, ror #8
+        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
+        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
+        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
+        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
+        smlabb  WK4, WK6, WK4, HALF      /* green2 */
+        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
+        uxtab16 WK3, WK3, WK3, ror #8
+        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
+        uxtab16 WK0, WK0, WK0, ror #8
+        uxtab16 WK4, WK4, WK4, ror #8
+        mov     WK1, WK1, ror #8
+        mov     WK3, WK3, ror #8
+        sel     WK2, WK1, WK0            /* recombine source*mask */
+        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
+        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
+30:     /* The destination buffer is already in the L1 cache, so
+         * there's little point in amalgamating writes */
+        pixst   , 4, 0, DST
+40:
+.endm
+
+/* Pixels are handled strictly one at a time: complete all but the
+ * last pixel here, leaving the last pixel's head outstanding for
+ * process_tail. */
+.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+        over_n_8888_8888_ca_1pixel_head
+        over_n_8888_8888_ca_1pixel_tail
+ .endr
+        over_n_8888_8888_ca_1pixel_head
+.endm
+
+/* Finish the single pixel left outstanding by process_head. */
+.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
+        over_n_8888_8888_ca_1pixel_tail
+.endm
+
+/* Entry point: if the solid source is opaque white (-1), tail-call the
+ * specialised over_white implementation; otherwise fall through into
+ * the general helper generated below.
+ * BUG FIX in the generate_composite_function invocation: commas were
+ * missing after the "32" argument and after the flags expression
+ * before their line continuations, which would make GAS mis-split the
+ * macro arguments on whitespace. */
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+        ldr     ip, [sp]
+        cmp     ip, #-1
+        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
+        /* else drop through... */
+ .endfunc
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    over_n_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_n_8888_8888_ca_cleanup, \
+    over_n_8888_8888_ca_process_head, \
+    over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
+/* in_reverse: dest = dest * source_alpha (per channel, rounded).
+ * Only the alpha byte of each source pixel is needed, so SRC is
+ * advanced by 3 to point directly at the alpha bytes. */
+.macro in_reverse_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3
+        line_saved_regs  ORIG_W
+.endm
+
+/* Load the alpha byte of 1, 2 or 4 source pixels (first into ORIG_W)
+ * and pre-advance DST so the tail can address pixels with negative
+ * offsets / stmdb. */
+.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
+        ldrb    ORIG_W, [SRC], #4
+ .if numbytes >= 8
+        ldrb    WK&reg1, [SRC], #4
+  .if numbytes == 16
+        ldrb    WK&reg2, [SRC], #4
+        ldrb    WK&reg3, [SRC], #4
+  .endif
+ .endif
+        add     DST, DST, #numbytes
+.endm
+
+/* Expand firstreg into the consecutive register numbers the head expects. */
+.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+.endm
+
+/* Scale one dest pixel d by source alpha s (with MASK rounding and SEL
+ * recombination).  In the multi-pixel case (is_only != 1) there are
+ * shortcut branches: alpha 0 -> write 0; alpha 0xFF -> leave d as-is;
+ * the next alpha byte is also reloaded into ORIG_W here.  Label 48 is
+ * a secondary entry used by the "all alphas zero" batch shortcut. */
+.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
+ .if is_only != 1
+        movs    s, ORIG_W
+  .if offset != 0
+        ldrb    ORIG_W, [SRC, #offset]
+  .endif
+        beq     01f
+        teq     STRIDE_M, #0xFF
+        beq     02f
+ .endif
+        uxtb16  SCRATCH, d                 /* rb_dest */
+        uxtb16  d, d, ror #8               /* ag_dest */
+        mla     SCRATCH, SCRATCH, s, MASK
+        mla     d, d, s, MASK
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 d, d, d, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     d, SCRATCH, d
+        b       02f
+ .if offset == 0
+48:     /* Last mov d,#0 of the set - used as part of shortcut for
+         * source values all 0 */
+ .endif
+01:     mov     d, #0
+02:
+.endm
+
+/* Batch tail: first test the loaded alpha bytes for the all-equal
+ * shortcuts (see inline comments — source all -1 leaves dest untouched
+ * with no load/store; source all 0 zeroes dest via the 48f entry);
+ * otherwise load the dest pixels back from the pre-advanced DST and
+ * scale each one. */
+.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        teq     ORIG_W, ORIG_W, asr #32
+        ldrne   WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg2}
+ .else
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, WK&reg2
+        teqeq   ORIG_W, WK&reg3
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg4}
+ .endif
+        cmnne   DST, #0   /* clear C if NE */
+        bcs     49f       /* no writes to dest if source all -1 */
+        beq     48f       /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
+        str     WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg2}
+ .else
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+/* Expand firstreg into the four consecutive register numbers the
+ * batch tail expects. */
+.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
+        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+/* in_reverse_8888_8888: 32bpp source, no mask, 32bpp dest; destination
+ * preload is disabled via FLAG_NO_PRELOAD_DST.
+ * BUG FIX: commas were missing after the "32" argument and after the
+ * flags expression before their line continuations; GAS would
+ * otherwise mis-split the macro arguments on whitespace. */
+generate_composite_function \
+    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
+    2, /* prefetch distance */ \
+    in_reverse_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    in_reverse_8888_8888_process_head, \
+    in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* over with a solid source, no mask:
+ * dest = dest * (255 - src_alpha) + src, saturating per byte.
+ * init caches the solid source, the 0x00800080 rounding constant and
+ * the (255 - src_alpha) multiplier, all loop-invariant. */
+.macro over_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Hold multiplier for destination in STRIDE_M */
+        mov     STRIDE_M, #255
+        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+.endm
+
+/* Source is solid, so only destination pixels need loading. */
+.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+
+/* WK&dst = WK&dst * (255 - src_alpha) + src (STRIDE_M holds the
+ * multiplier, see over_n_8888_init), saturating per byte. */
+.macro over_n_8888_1pixel dst
+        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
+        uqadd8  WK&dst, WK&dst, SRC
+.endm
+
+/* Composite numbytes/4 pixels held in consecutive WK registers, then
+ * store them all. */
+.macro over_n_8888_process_tail  cond, numbytes, firstreg
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8888_1pixel %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+.endm
+
+/* over_n_8888: solid source (src_bpp = 0), no mask, 32bpp dest.
+ * BUG FIX: commas were missing after the "32" argument and after the
+ * flags expression before their line continuations; GAS would
+ * otherwise mis-split the macro arguments on whitespace. */
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
+    2, /* prefetch distance */ \
+    over_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_n_8888_process_head, \
+    over_n_8888_process_tail
+
+/******************************************************************************/
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
@@ -71,27 +71,42 @@
 .set FLAG_PROCESS_DOESNT_STORE,  0
 .set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
 .set FLAG_NO_SPILL_LINE_VARS,        0
 .set FLAG_SPILL_LINE_VARS_WIDE,      16
 .set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
 .set FLAG_SPILL_LINE_VARS,           48
 .set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0,     0
+.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST,               0
+.set FLAG_NO_PRELOAD_DST,            256
+
+/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
 
 /*
  * Offset into stack where mask and source pointer/stride can be accessed.
  */
 #ifdef DEBUG_PARAMS
 .set ARGS_STACK_OFFSET,        (9*4+9*4)
 #else
 .set ARGS_STACK_OFFSET,        (9*4)
 #endif
 
 /*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET,     0
+
+/*
  * Constants for selecting preferable prefetch type.
  */
 .set PREFETCH_TYPE_NONE,       0
 .set PREFETCH_TYPE_STANDARD,   1
 
 /*
  * Definitions of macros for load/store of pixel data.
  */
@@ -191,18 +206,18 @@
         /* The test can be simplified further when preloading the destination */
         PF  tst,    base, #16
         PF  beq,    61f
   .else
    .if bpp/dst_w_bpp == 4
         PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
         PF  and,    SCRATCH, SCRATCH, #31
         PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
-        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
-        PF  movs,   SCRATCH, SCRATCH, #32-6 /* so this sets         NC   /  nc   /   Nc   */
+        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
+        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
         PF  bcs,    61f
         PF  bpl,    60f
         PF  pld,    [ptr, #32*(prefetch_distance+2)]
    .else
         PF  mov,    SCRATCH, base, lsl #32-5
         PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
         PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
         PF  bls,    61f
@@ -354,33 +369,51 @@ 100:
         process_tail  cond2, numbytes2, firstreg2
         pixst   cond1, numbytes1, firstreg1, DST
         pixst   cond2, numbytes2, firstreg2, DST
  .endif
 .endm
 
 
 .macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
+ .else
         movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
+ .endif
 .endm
 
 .macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
+ .else
         movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
+ .endif
 .endm
 
 .macro leading_15bytes  process_head, process_tail
         /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set DECREMENT_X, 0
+        sub     X, X, WK0, lsr #dst_bpp_shift
+        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
+        mov     X, WK0
+ .endif
         /* Use unaligned loads in all cases for simplicity */
  .if dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
  .elseif dst_w_bpp == 16
         test_bits_1_0_ptr
-        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
+        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
  .endif
-        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
+        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
+ .endif
 .endm
 
 .macro test_bits_3_2_pix
         movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
 .endm
 
 .macro test_bits_1_0_pix
  .if dst_w_bpp == 8
@@ -409,17 +442,17 @@ 110:
   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
         preload_middle  src_bpp, SRC, 1
   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
         preload_middle  mask_bpp, MASK, 1
   .else
         preload_middle  src_bpp, SRC, 0
         preload_middle  mask_bpp, MASK, 0
   .endif
-  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
         /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
          * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
          * preloads for, to achieve staggered prefetches for multiple channels, because there are
          * always two STMs per prefetch, so there is always an opposite STM on which to put the
          * preload. Note, no need to BIC the base register here */
         PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
   .endif
         process_tail  , 16, 0
@@ -432,29 +465,31 @@ 110:
         bhs     110b
 .endm
 
 .macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
         /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
  .if dst_r_bpp > 0
         tst     DST, #16
         bne     111f
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
         b       112f
 111:
  .endif
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
 112:
         /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
  .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
         PF  and,    WK0, X, #pix_per_block-1
  .endif
         preload_trailing  src_bpp, src_bpp_shift, SRC
         preload_trailing  mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_trailing  dst_r_bpp, dst_bpp_shift, DST
+ .endif
         add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
         /* The remainder of the line is handled identically to the medium case */
         medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
 .endm
 
 .macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
 120:
         process_head  , 16, 0, unaligned_src, unaligned_mask, 0
@@ -556,23 +591,17 @@ 142:
                                    prefetch_distance_, \
                                    init, \
                                    newline, \
                                    cleanup, \
                                    process_head, \
                                    process_tail, \
                                    process_inner_loop
 
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
+    pixman_asm_function fname
 
 /*
  * Make some macro arguments globally visible and accessible
  * from other macros
  */
  .set src_bpp, src_bpp_
  .set mask_bpp, mask_bpp_
  .set dst_w_bpp, dst_w_bpp_
@@ -674,26 +703,22 @@ 142:
     STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
     WK0         .req    r8  /* pixel data registers */
     WK1         .req    r9
     WK2         .req    r10
     WK3         .req    r11
     SCRATCH     .req    r12
     ORIG_W      .req    r14 /* width (pixels) */
 
-fname:
-        .fnstart
-	.save   {r4-r11, lr}
         push    {r4-r11, lr}        /* save all registers */
 
         subs    Y, Y, #1
         blo     199f
 
 #ifdef DEBUG_PARAMS
-	.pad    #9*4
         sub     sp, sp, #9*4
 #endif
 
  .if src_bpp > 0
         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
         ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
  .endif
  .if mask_bpp > 0
@@ -703,16 +728,23 @@ fname:
         
 #ifdef DEBUG_PARAMS
         add     Y, Y, #1
         stmia   sp, {r0-r7,pc}
         sub     Y, Y, #1
 #endif
 
         init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        /* Reserve a word in which to store X during leading pixels */
+        sub     sp, sp, #4
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
         
         lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
         sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
  .if src_bpp > 0
         lsl     STRIDE_S, #src_bpp_shift
         sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
  .endif
  .if mask_bpp > 0
@@ -732,72 +764,83 @@ fname:
         /* Adjust X so that the decrement instruction can also test for
          * inner loop termination. We want it to stop when there are
          * (prefetch_distance+1) complete blocks to go. */
         sub     X, X, #(prefetch_distance+2)*pix_per_block
         mov     ORIG_W, X
   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
         /* This is stmdb sp!,{} */
         .word   0xE92D0000 | LINE_SAVED_REGS
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
   .endif
 151:    /* New line */
         newline
         preload_leading_step1  src_bpp, WK1, SRC
         preload_leading_step1  mask_bpp, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_leading_step1  dst_r_bpp, WK3, DST
+  .endif
         
-        tst     DST, #15
+        ands    WK0, DST, #15
         beq     154f
-        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
-  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
-        PF  and,    WK0, WK0, #15
-  .endif
+        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
 
         preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
         preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
+  .endif
 
         leading_15bytes  process_head, process_tail
         
 154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
- .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
         and     SCRATCH, SRC, #31
         rsb     SCRATCH, SCRATCH, #32*prefetch_distance
- .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
         and     SCRATCH, MASK, #31
         rsb     SCRATCH, SCRATCH, #32*prefetch_distance
- .endif
- .ifc "process_inner_loop",""
+  .endif
+  .ifc "process_inner_loop",""
         switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
- .else
+  .else
         switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
- .endif
+  .endif
 
 157:    /* Check for another line */
         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .endif
  .endif
 
  .ltorg
 
 160:    /* Medium case */
         mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
         /* This is stmdb sp!,{} */
         .word   0xE92D0000 | LINE_SAVED_REGS
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .endif
 161:    /* New line */
         newline
         preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
         
         sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
-        tst     DST, #15
+        ands    WK0, DST, #15
         beq     164f
-        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
         
         leading_15bytes  process_head, process_tail
         
 164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
         switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
         
 167:    /* Check for another line */
         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -811,17 +854,19 @@ 170:    /* Narrow case, less than 31 byt
  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
         /* This is stmdb sp!,{} */
         .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
 171:    /* New line */
         newline
         preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
         
  .if dst_w_bpp == 8
         tst     DST, #3
         beq     174f
 172:    subs    X, X, #1
         blo     177f
         process_head  , 1, 0, 1, 1, 0
         process_tail  , 1, 0
@@ -842,30 +887,39 @@ 172:    subs    X, X, #1
   .endif
  .endif
 
 174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
         switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
 
 177:    /* Check for another line */
         end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
 
 197:
  .if (flags) & FLAG_SPILL_LINE_VARS
         add     sp, sp, #LINE_SAVED_REG_COUNT*4
  .endif
 198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+        add     sp, sp, #4
+ .endif
+
         cleanup
 
 #ifdef DEBUG_PARAMS
         add     sp, sp, #9*4 /* junk the debug copy of arguments */
 #endif
 199:
         pop     {r4-r11, pc}  /* exit */
-	.fnend
 
  .ltorg
 
     .unreq  X
     .unreq  Y
     .unreq  DST
     .unreq  STRIDE_D
     .unreq  SRC
--- a/gfx/cairo/libpixman/src/pixman-arm-simd.c
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd.c
@@ -36,28 +36,40 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
                                    uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
                                    uint16_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
                                    uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_0565,
+                                   uint32_t, 1, uint16_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
+                                 uint32_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
                                      uint32_t, 1, uint32_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                         uint16_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
 void
 pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
                                        int32_t   h,
@@ -211,41 +223,63 @@ static const pixman_fast_path_t arm_simd
     PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
     PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
 
     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
 
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
 
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
+
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
 
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888),
+
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
 
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
 
     { PIXMAN_OP_NONE },
 };
 
 pixman_implementation_t *
 _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
--- a/gfx/cairo/libpixman/src/pixman-arm.c
+++ b/gfx/cairo/libpixman/src/pixman-arm.c
@@ -91,43 +91,39 @@ detect_cpu_features (void)
     features |= ARM_NEON;
 #endif
 
     return features;
 }
 
 #elif defined(__ANDROID__) || defined(ANDROID) /* Android */
 
+#include <cpu-features.h>
+
 static arm_cpu_features_t
 detect_cpu_features (void)
 {
     arm_cpu_features_t features = 0;
-    char buf[1024];
-    char* pos;
-    const char* ver_token = "CPU architecture: ";
-    FILE* f = fopen("/proc/cpuinfo", "r");
-    if (!f) {
-	return features;
-    }
+    AndroidCpuFamily cpu_family;
+    uint64_t cpu_features;
+
+    cpu_family = android_getCpuFamily();
+    cpu_features = android_getCpuFeatures();
 
-    fread(buf, sizeof(char), sizeof(buf), f);
-    fclose(f);
-    pos = strstr(buf, ver_token);
-    if (pos) {
-	char vchar = *(pos + strlen(ver_token));
-	if (vchar >= '0' && vchar <= '9') {
-	    int ver = vchar - '0';
-	    if (ver >= 7)
-		features |= ARM_V7;
-	}
+    if (cpu_family == ANDROID_CPU_FAMILY_ARM)
+    {
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_ARMv7)
+	    features |= ARM_V7;
+
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_VFPv3)
+	    features |= ARM_VFP;
+
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
+	    features |= ARM_NEON;
     }
-    if (strstr(buf, "neon") != NULL)
-	features |= ARM_NEON;
-    if (strstr(buf, "vfp") != NULL)
-	features |= ARM_VFP;
 
     return features;
 }
 
 #elif defined (__linux__) /* linux ELF */
 
 #include <unistd.h>
 #include <sys/types.h>
@@ -175,16 +171,41 @@ detect_cpu_features (void)
 	    }
 	}
 	close (fd);
     }
 
     return features;
 }
 
+#elif defined (_3DS) /* 3DS homebrew (devkitARM) */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+
+    features |= ARM_V6;
+
+    return features;
+}
+
+#elif defined (PSP2) || defined (__SWITCH__)
+/* Vita (VitaSDK) or Switch (devkitA64) homebrew */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+
+    features |= ARM_NEON;
+
+    return features;
+}
+
 #else /* Unknown */
 
 static arm_cpu_features_t
 detect_cpu_features (void)
 {
     return 0;
 }
 
--- a/gfx/cairo/libpixman/src/pixman-bits-image.c
+++ b/gfx/cairo/libpixman/src/pixman-bits-image.c
@@ -30,83 +30,88 @@
 #include <config.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
-
-static uint32_t *
-_pixman_image_get_scanline_generic_float (pixman_iter_t * iter,
-					  const uint32_t *mask)
-{
-    pixman_iter_get_scanline_t fetch_32 = iter->data;
-    uint32_t *buffer = iter->buffer;
-
-    fetch_32 (iter, NULL);
-
-    pixman_expand_to_float ((argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
-    return iter->buffer;
-}
+#include "dither/blue-noise-64x64.h"
 
 /* Fetch functions */
 
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
-		      int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_no_alpha_32 (bits_image_t *image,
+			 int x, int y, pixman_bool_t check_bounds,
+			 void *out)
 {
+    uint32_t *ret = out;
+
     if (check_bounds &&
 	(x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-	return 0;
-    }
-
-    return image->fetch_pixel_32 (image, x, y);
+	*ret = 0;
+    else
+	*ret = image->fetch_pixel_32 (image, x, y);
 }
 
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
-				  int x, int y, pixman_bool_t check_bounds);
+static force_inline void
+fetch_pixel_no_alpha_float (bits_image_t *image,
+			    int x, int y, pixman_bool_t check_bounds,
+			    void *out)
+{
+    argb_t *ret = out;
 
-static force_inline uint32_t
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+	ret->a = ret->r = ret->g = ret->b = 0.f;
+    else
+	*ret = image->fetch_pixel_float (image, x, y);
+}
+
+typedef void (* get_pixel_t) (bits_image_t *image,
+			      int x, int y, pixman_bool_t check_bounds, void *out);
+
+static force_inline void
 bits_image_fetch_pixel_nearest (bits_image_t   *image,
 				pixman_fixed_t  x,
 				pixman_fixed_t  y,
-				get_pixel_t	get_pixel)
+				get_pixel_t	get_pixel,
+				void	       *out)
 {
     int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
     int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
 
     if (image->common.repeat != PIXMAN_REPEAT_NONE)
     {
 	repeat (image->common.repeat, &x0, image->width);
 	repeat (image->common.repeat, &y0, image->height);
 
-	return get_pixel (image, x0, y0, FALSE);
+	get_pixel (image, x0, y0, FALSE, out);
     }
     else
     {
-	return get_pixel (image, x0, y0, TRUE);
+	get_pixel (image, x0, y0, TRUE, out);
     }
 }
 
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t   *image,
-				 pixman_fixed_t  x,
-				 pixman_fixed_t  y,
-				 get_pixel_t	 get_pixel)
+static force_inline void
+bits_image_fetch_pixel_bilinear_32 (bits_image_t   *image,
+				    pixman_fixed_t  x,
+				    pixman_fixed_t  y,
+				    get_pixel_t	    get_pixel,
+				    void	   *out)
 {
     pixman_repeat_t repeat_mode = image->common.repeat;
     int width = image->width;
     int height = image->height;
     int x1, y1, x2, y2;
     uint32_t tl, tr, bl, br;
     int32_t distx, disty;
+    uint32_t *ret = out;
 
     x1 = x - pixman_fixed_1 / 2;
     y1 = y - pixman_fixed_1 / 2;
 
     distx = pixman_fixed_to_bilinear_weight (x1);
     disty = pixman_fixed_to_bilinear_weight (y1);
 
     x1 = pixman_fixed_to_int (x1);
@@ -116,263 +121,163 @@ bits_image_fetch_pixel_bilinear (bits_im
 
     if (repeat_mode != PIXMAN_REPEAT_NONE)
     {
 	repeat (repeat_mode, &x1, width);
 	repeat (repeat_mode, &y1, height);
 	repeat (repeat_mode, &x2, width);
 	repeat (repeat_mode, &y2, height);
 
-	tl = get_pixel (image, x1, y1, FALSE);
-	bl = get_pixel (image, x1, y2, FALSE);
-	tr = get_pixel (image, x2, y1, FALSE);
-	br = get_pixel (image, x2, y2, FALSE);
+	get_pixel (image, x1, y1, FALSE, &tl);
+	get_pixel (image, x2, y1, FALSE, &tr);
+	get_pixel (image, x1, y2, FALSE, &bl);
+	get_pixel (image, x2, y2, FALSE, &br);
     }
     else
     {
-	tl = get_pixel (image, x1, y1, TRUE);
-	tr = get_pixel (image, x2, y1, TRUE);
-	bl = get_pixel (image, x1, y2, TRUE);
-	br = get_pixel (image, x2, y2, TRUE);
+	get_pixel (image, x1, y1, TRUE, &tl);
+	get_pixel (image, x2, y1, TRUE, &tr);
+	get_pixel (image, x1, y2, TRUE, &bl);
+	get_pixel (image, x2, y2, TRUE, &br);
     }
 
-    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+    *ret = bilinear_interpolation (tl, tr, bl, br, distx, disty);
 }
 
-static uint32_t *
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
-					  const uint32_t *mask)
+static force_inline void
+bits_image_fetch_pixel_bilinear_float (bits_image_t   *image,
+				       pixman_fixed_t  x,
+				       pixman_fixed_t  y,
+				       get_pixel_t     get_pixel,
+				       void	      *out)
 {
-
-    pixman_image_t * ima = iter->image;
-    int              offset = iter->x;
-    int              line = iter->y++;
-    int              width = iter->width;
-    uint32_t *       buffer = iter->buffer;
-
-    bits_image_t *bits = &ima->bits;
-    pixman_fixed_t x_top, x_bottom, x;
-    pixman_fixed_t ux_top, ux_bottom, ux;
-    pixman_vector_t v;
-    uint32_t top_mask, bottom_mask;
-    uint32_t *top_row;
-    uint32_t *bottom_row;
-    uint32_t *end;
-    uint32_t zero[2] = { 0, 0 };
-    uint32_t one = 1;
-    int y, y1, y2;
-    int disty;
-    int mask_inc;
-    int w;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    argb_t tl, tr, bl, br;
+    float distx, disty;
+    argb_t *ret = out;
 
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (bits->common.transform, &v))
-	return iter->buffer;
-
-    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
-    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+    x1 = x - pixman_fixed_1 / 2;
+    y1 = y - pixman_fixed_1 / 2;
 
-    y = v.vector[1] - pixman_fixed_1/2;
-    disty = pixman_fixed_to_bilinear_weight (y);
+    distx = ((float)pixman_fixed_fraction(x1)) / 65536.f;
+    disty = ((float)pixman_fixed_fraction(y1)) / 65536.f;
 
-    /* Load the pointers to the first and second lines from the source
-     * image that bilinear code must read.
-     *
-     * The main trick in this code is about the check if any line are
-     * outside of the image;
-     *
-     * When I realize that a line (any one) is outside, I change
-     * the pointer to a dummy area with zeros. Once I change this, I
-     * must be sure the pointer will not change, so I set the
-     * variables to each pointer increments inside the loop.
-     */
-    y1 = pixman_fixed_to_int (y);
+    x1 = pixman_fixed_to_int (x1);
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
     y2 = y1 + 1;
 
-    if (y1 < 0 || y1 >= bits->height)
-    {
-	top_row = zero;
-	x_top = 0;
-	ux_top = 0;
-    }
-    else
-    {
-	top_row = bits->bits + y1 * bits->rowstride;
-	x_top = x;
-	ux_top = ux;
-    }
-
-    if (y2 < 0 || y2 >= bits->height)
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
     {
-	bottom_row = zero;
-	x_bottom = 0;
-	ux_bottom = 0;
-    }
-    else
-    {
-	bottom_row = bits->bits + y2 * bits->rowstride;
-	x_bottom = x;
-	ux_bottom = ux;
-    }
+	repeat (repeat_mode, &x1, width);
+	repeat (repeat_mode, &y1, height);
+	repeat (repeat_mode, &x2, width);
+	repeat (repeat_mode, &y2, height);
 
-    /* Instead of checking whether the operation uses the mast in
-     * each loop iteration, verify this only once and prepare the
-     * variables to make the code smaller inside the loop.
-     */
-    if (!mask)
-    {
-        mask_inc = 0;
-        mask = &one;
+	get_pixel (image, x1, y1, FALSE, &tl);
+	get_pixel (image, x2, y1, FALSE, &tr);
+	get_pixel (image, x1, y2, FALSE, &bl);
+	get_pixel (image, x2, y2, FALSE, &br);
     }
     else
     {
-        /* If have a mask, prepare the variables to check it */
-        mask_inc = 1;
-    }
-
-    /* If both are zero, then the whole thing is zero */
-    if (top_row == zero && bottom_row == zero)
-    {
-	memset (buffer, 0, width * sizeof (uint32_t));
-	return iter->buffer;
-    }
-    else if (bits->format == PIXMAN_x8r8g8b8)
-    {
-	if (top_row == zero)
-	{
-	    top_mask = 0;
-	    bottom_mask = 0xff000000;
-	}
-	else if (bottom_row == zero)
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0;
-	}
-	else
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0xff000000;
-	}
-    }
-    else
-    {
-	top_mask = 0;
-	bottom_mask = 0;
-    }
-
-    end = buffer + width;
-
-    /* Zero fill to the left of the image */
-    while (buffer < end && x < pixman_fixed_minus_1)
-    {
-	*buffer++ = 0;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
+	get_pixel (image, x1, y1, TRUE, &tl);
+	get_pixel (image, x2, y1, TRUE, &tr);
+	get_pixel (image, x1, y2, TRUE, &bl);
+	get_pixel (image, x2, y2, TRUE, &br);
     }
 
-    /* Left edge
-     */
-    while (buffer < end && x < 0)
-    {
-	uint32_t tr, br;
-	int32_t distx;
-
-	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
-	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	distx = pixman_fixed_to_bilinear_weight (x);
-
-	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+    *ret = bilinear_interpolation_float (tl, tr, bl, br, distx, disty);
+}
 
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Main part */
-    w = pixman_int_to_fixed (bits->width - 1);
+static force_inline void accum_32(unsigned int *satot, unsigned int *srtot,
+				  unsigned int *sgtot, unsigned int *sbtot,
+				  const void *p, pixman_fixed_t f)
+{
+    uint32_t pixel = *(uint32_t *)p;
 
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, tr, bl, br;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	    distx = pixman_fixed_to_bilinear_weight (x);
+    *srtot += (int)RED_8 (pixel) * f;
+    *sgtot += (int)GREEN_8 (pixel) * f;
+    *sbtot += (int)BLUE_8 (pixel) * f;
+    *satot += (int)ALPHA_8 (pixel) * f;
+}
 
-	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Right Edge */
-    w = pixman_int_to_fixed (bits->width);
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, bl;
-	    int32_t distx;
+static force_inline void reduce_32(unsigned int satot, unsigned int srtot,
+				   unsigned int sgtot, unsigned int sbtot,
+                                   void *p)
+{
+    uint32_t *ret = p;
 
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
-	    distx = pixman_fixed_to_bilinear_weight (x);
-
-	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
-	}
+    satot = (satot + 0x8000) >> 16;
+    srtot = (srtot + 0x8000) >> 16;
+    sgtot = (sgtot + 0x8000) >> 16;
+    sbtot = (sbtot + 0x8000) >> 16;
 
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
 
-    /* Zero fill to the left of the image */
-    while (buffer < end)
-	*buffer++ = 0;
-
-    return iter->buffer;
+    *ret = ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
 }
 
-static force_inline uint32_t
+static force_inline void accum_float(unsigned int *satot, unsigned int *srtot,
+				     unsigned int *sgtot, unsigned int *sbtot,
+				     const void *p, pixman_fixed_t f)
+{
+    const argb_t *pixel = p;
+
+    *satot += pixel->a * f;
+    *srtot += pixel->r * f;
+    *sgtot += pixel->g * f;
+    *sbtot += pixel->b * f;
+}
+
+static force_inline void reduce_float(unsigned int satot, unsigned int srtot,
+				      unsigned int sgtot, unsigned int sbtot,
+				      void *p)
+{
+    argb_t *ret = p;
+
+    ret->a = CLIP (satot / 65536.f, 0.f, 1.f);
+    ret->r = CLIP (srtot / 65536.f, 0.f, 1.f);
+    ret->g = CLIP (sgtot / 65536.f, 0.f, 1.f);
+    ret->b = CLIP (sbtot / 65536.f, 0.f, 1.f);
+}
+
+typedef void (* accumulate_pixel_t) (unsigned int *satot, unsigned int *srtot,
+				     unsigned int *sgtot, unsigned int *sbtot,
+				     const void *pixel, pixman_fixed_t f);
+
+typedef void (* reduce_pixel_t) (unsigned int satot, unsigned int srtot,
+				 unsigned int sgtot, unsigned int sbtot,
+                                 void *out);
+
+static force_inline void
 bits_image_fetch_pixel_convolution (bits_image_t   *image,
 				    pixman_fixed_t  x,
 				    pixman_fixed_t  y,
-				    get_pixel_t     get_pixel)
+				    get_pixel_t     get_pixel,
+				    void	      *out,
+				    accumulate_pixel_t accum,
+				    reduce_pixel_t reduce)
 {
     pixman_fixed_t *params = image->common.filter_params;
     int x_off = (params[0] - pixman_fixed_1) >> 1;
     int y_off = (params[1] - pixman_fixed_1) >> 1;
     int32_t cwidth = pixman_fixed_to_int (params[0]);
     int32_t cheight = pixman_fixed_to_int (params[1]);
     int32_t i, j, x1, x2, y1, y2;
     pixman_repeat_t repeat_mode = image->common.repeat;
     int width = image->width;
     int height = image->height;
-    int srtot, sgtot, sbtot, satot;
+    unsigned int srtot, sgtot, sbtot, satot;
 
     params += 2;
 
     x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
     y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
     x2 = x1 + cwidth;
     y2 = y1 + cheight;
 
@@ -384,73 +289,64 @@ bits_image_fetch_pixel_convolution (bits
 	{
 	    int rx = j;
 	    int ry = i;
 
 	    pixman_fixed_t f = *params;
 
 	    if (f)
 	    {
-		uint32_t pixel;
+		/* Must be big enough to hold a argb_t */
+		argb_t pixel;
 
 		if (repeat_mode != PIXMAN_REPEAT_NONE)
 		{
 		    repeat (repeat_mode, &rx, width);
 		    repeat (repeat_mode, &ry, height);
 
-		    pixel = get_pixel (image, rx, ry, FALSE);
+		    get_pixel (image, rx, ry, FALSE, &pixel);
 		}
 		else
 		{
-		    pixel = get_pixel (image, rx, ry, TRUE);
+		    get_pixel (image, rx, ry, TRUE, &pixel);
 		}
 
-		srtot += (int)RED_8 (pixel) * f;
-		sgtot += (int)GREEN_8 (pixel) * f;
-		sbtot += (int)BLUE_8 (pixel) * f;
-		satot += (int)ALPHA_8 (pixel) * f;
+		accum (&satot, &srtot, &sgtot, &sbtot, &pixel, f);
 	    }
 
 	    params++;
 	}
     }
 
-    satot = (satot + 0x8000) >> 16;
-    srtot = (srtot + 0x8000) >> 16;
-    sgtot = (sgtot + 0x8000) >> 16;
-    sbtot = (sbtot + 0x8000) >> 16;
-
-    satot = CLIP (satot, 0, 0xff);
-    srtot = CLIP (srtot, 0, 0xff);
-    sgtot = CLIP (sgtot, 0, 0xff);
-    sbtot = CLIP (sbtot, 0, 0xff);
-
-    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+    reduce (satot, srtot, sgtot, sbtot, out);
 }
 
-static uint32_t
-bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
-                                              pixman_fixed_t x,
-                                              pixman_fixed_t y,
-                                              get_pixel_t    get_pixel)
+static void
+bits_image_fetch_pixel_separable_convolution (bits_image_t  *image,
+					      pixman_fixed_t x,
+					      pixman_fixed_t y,
+					      get_pixel_t    get_pixel,
+					      void	    *out,
+					      accumulate_pixel_t accum,
+					      reduce_pixel_t     reduce)
 {
     pixman_fixed_t *params = image->common.filter_params;
     pixman_repeat_t repeat_mode = image->common.repeat;
     int width = image->width;
     int height = image->height;
     int cwidth = pixman_fixed_to_int (params[0]);
     int cheight = pixman_fixed_to_int (params[1]);
     int x_phase_bits = pixman_fixed_to_int (params[2]);
     int y_phase_bits = pixman_fixed_to_int (params[3]);
     int x_phase_shift = 16 - x_phase_bits;
     int y_phase_shift = 16 - y_phase_bits;
     int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
     int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
     pixman_fixed_t *y_params;
-    int srtot, sgtot, sbtot, satot;
+    unsigned int srtot, sgtot, sbtot, satot;
     int32_t x1, x2, y1, y2;
     int32_t px, py;
     int i, j;
 
     /* Round x and y to the middle of the closest phase before continuing. This
      * ensures that the convolution matrix is aligned right, since it was
      * positioned relative to a particular phase (and not relative to whatever
      * exact fraction we happen to get here).
@@ -480,103 +376,123 @@ bits_image_fetch_pixel_separable_convolu
             for (j = x1; j < x2; ++j)
             {
                 pixman_fixed_t fx = *x_params++;
 		int rx = j;
 		int ry = i;
 
                 if (fx)
                 {
+                    /* Must be big enough to hold a argb_t */
+                    argb_t pixel;
                     pixman_fixed_t f;
-                    uint32_t pixel;
 
                     if (repeat_mode != PIXMAN_REPEAT_NONE)
                     {
                         repeat (repeat_mode, &rx, width);
                         repeat (repeat_mode, &ry, height);
 
-                        pixel = get_pixel (image, rx, ry, FALSE);
+                        get_pixel (image, rx, ry, FALSE, &pixel);
                     }
                     else
                     {
-                        pixel = get_pixel (image, rx, ry, TRUE);
+                        get_pixel (image, rx, ry, TRUE, &pixel);
 		    }
 
                     f = (fy * fx + 0x8000) >> 16;
 
-                    srtot += (int)RED_8 (pixel) * f;
-                    sgtot += (int)GREEN_8 (pixel) * f;
-                    sbtot += (int)BLUE_8 (pixel) * f;
-                    satot += (int)ALPHA_8 (pixel) * f;
+		    accum(&satot, &srtot, &sgtot, &sbtot, &pixel, f);
                 }
             }
 	}
     }
 
-    satot = (satot + 0x8000) >> 16;
-    srtot = (srtot + 0x8000) >> 16;
-    sgtot = (sgtot + 0x8000) >> 16;
-    sbtot = (sbtot + 0x8000) >> 16;
 
-    satot = CLIP (satot, 0, 0xff);
-    srtot = CLIP (srtot, 0, 0xff);
-    sgtot = CLIP (sgtot, 0, 0xff);
-    sbtot = CLIP (sbtot, 0, 0xff);
-
-    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+    reduce(satot, srtot, sgtot, sbtot, out);
 }
 
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
+static force_inline void
+bits_image_fetch_pixel_filtered (bits_image_t  *image,
+				 pixman_bool_t  wide,
 				 pixman_fixed_t x,
 				 pixman_fixed_t y,
-				 get_pixel_t    get_pixel)
+				 get_pixel_t    get_pixel,
+				 void          *out)
 {
     switch (image->common.filter)
     {
     case PIXMAN_FILTER_NEAREST:
     case PIXMAN_FILTER_FAST:
-	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+	bits_image_fetch_pixel_nearest (image, x, y, get_pixel, out);
 	break;
 
     case PIXMAN_FILTER_BILINEAR:
     case PIXMAN_FILTER_GOOD:
     case PIXMAN_FILTER_BEST:
-	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+	if (wide)
+	    bits_image_fetch_pixel_bilinear_float (image, x, y, get_pixel, out);
+	else
+	    bits_image_fetch_pixel_bilinear_32 (image, x, y, get_pixel, out);
 	break;
 
     case PIXMAN_FILTER_CONVOLUTION:
-	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+	if (wide)
+	{
+	    bits_image_fetch_pixel_convolution (image, x, y,
+						get_pixel, out,
+						accum_float,
+						reduce_float);
+	}
+	else
+	{
+	    bits_image_fetch_pixel_convolution (image, x, y,
+						get_pixel, out,
+						accum_32, reduce_32);
+	}
 	break;
 
     case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
-        return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel);
+	if (wide)
+	{
+	    bits_image_fetch_pixel_separable_convolution (image, x, y,
+							  get_pixel, out,
+							  accum_float,
+							  reduce_float);
+	}
+	else
+	{
+	    bits_image_fetch_pixel_separable_convolution (image, x, y,
+							  get_pixel, out,
+							  accum_32, reduce_32);
+	}
         break;
 
     default:
+	assert (0);
         break;
     }
-
-    return 0;
 }
 
 static uint32_t *
-bits_image_fetch_affine_no_alpha (pixman_iter_t *  iter,
-				  const uint32_t * mask)
+__bits_image_fetch_affine_no_alpha (pixman_iter_t *  iter,
+				    pixman_bool_t    wide,
+				    const uint32_t * mask)
 {
     pixman_image_t *image  = iter->image;
     int             offset = iter->x;
     int             line   = iter->y++;
     int             width  = iter->width;
     uint32_t *      buffer = iter->buffer;
 
     pixman_fixed_t x, y;
     pixman_fixed_t ux, uy;
     pixman_vector_t v;
     int i;
+    get_pixel_t get_pixel =
+	wide ? fetch_pixel_no_alpha_float : fetch_pixel_no_alpha_32;
 
     /* reference point is the center of the pixel */
     v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
     v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
     v.vector[2] = pixman_fixed_1;
 
     if (image->common.transform)
     {
@@ -594,37 +510,55 @@ bits_image_fetch_affine_no_alpha (pixman
 
     x = v.vector[0];
     y = v.vector[1];
 
     for (i = 0; i < width; ++i)
     {
 	if (!mask || mask[i])
 	{
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x, y, fetch_pixel_no_alpha);
+	    bits_image_fetch_pixel_filtered (
+		&image->bits, wide, x, y, get_pixel, buffer);
 	}
 
 	x += ux;
 	y += uy;
+	buffer += wide ? 4 : 1;
     }
 
-    return buffer;
+    return iter->buffer;
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_32 (pixman_iter_t  *iter,
+				     const uint32_t *mask)
+{
+    return __bits_image_fetch_affine_no_alpha(iter, FALSE, mask);
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_float (pixman_iter_t  *iter,
+					const uint32_t *mask)
+{
+    return __bits_image_fetch_affine_no_alpha(iter, TRUE, mask);
 }
 
 /* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_general_32 (bits_image_t *image,
+			int x, int y, pixman_bool_t check_bounds,
+			void *out)
 {
-    uint32_t pixel;
+    uint32_t pixel, *ret = out;
 
     if (check_bounds &&
 	(x < 0 || x >= image->width || y < 0 || y >= image->height))
     {
-	return 0;
+	*ret = 0;
+	return;
     }
 
     pixel = image->fetch_pixel_32 (image, x, y);
 
     if (image->common.alpha_map)
     {
 	uint32_t pixel_a;
 
@@ -643,28 +577,69 @@ fetch_pixel_general (bits_image_t *image
 
 	    pixel_a = ALPHA_8 (pixel_a);
 	}
 
 	pixel &= 0x00ffffff;
 	pixel |= (pixel_a << 24);
     }
 
-    return pixel;
+    *ret = pixel;
+}
+
+static force_inline void
+fetch_pixel_general_float (bits_image_t *image,
+			int x, int y, pixman_bool_t check_bounds,
+			void *out)
+{
+    argb_t *ret = out;
+
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	ret->a = ret->r = ret->g = ret->b = 0;
+	return;
+    }
+
+    *ret = image->fetch_pixel_float (image, x, y);
+
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	if (x < 0 || x >= image->common.alpha_map->width ||
+	    y < 0 || y >= image->common.alpha_map->height)
+	{
+	    ret->a = 0.f;
+	}
+	else
+	{
+	    argb_t alpha;
+
+	    alpha = image->common.alpha_map->fetch_pixel_float (
+		    image->common.alpha_map, x, y);
+
+	    ret->a = alpha.a;
+	}
+    }
 }
 
 static uint32_t *
-bits_image_fetch_general (pixman_iter_t  *iter,
-			  const uint32_t *mask)
+__bits_image_fetch_general (pixman_iter_t  *iter,
+			    pixman_bool_t wide,
+			    const uint32_t *mask)
 {
     pixman_image_t *image  = iter->image;
     int             offset = iter->x;
     int             line   = iter->y++;
     int             width  = iter->width;
     uint32_t *      buffer = iter->buffer;
+    get_pixel_t     get_pixel =
+	wide ? fetch_pixel_general_float : fetch_pixel_general_32;
 
     pixman_fixed_t x, y, w;
     pixman_fixed_t ux, uy, uw;
     pixman_vector_t v;
     int i;
 
     /* reference point is the center of the pixel */
     v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
@@ -694,503 +669,52 @@ bits_image_fetch_general (pixman_iter_t 
     for (i = 0; i < width; ++i)
     {
 	pixman_fixed_t x0, y0;
 
 	if (!mask || mask[i])
 	{
 	    if (w != 0)
 	    {
-		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
-		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+		x0 = ((uint64_t)x << 16) / w;
+		y0 = ((uint64_t)y << 16) / w;
 	    }
 	    else
 	    {
 		x0 = 0;
 		y0 = 0;
 	    }
 
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x0, y0, fetch_pixel_general);
+	    bits_image_fetch_pixel_filtered (
+		&image->bits, wide, x0, y0, get_pixel, buffer);
 	}
 
 	x += ux;
 	y += uy;
 	w += uw;
+	buffer += wide ? 4 : 1;
     }
 
-    return buffer;
-}
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
-					       int              offset,
-					       int              line,
-					       int              width,
-					       uint32_t *       buffer,
-					       const uint32_t * mask,
-
-					       convert_pixel_t	convert_pixel,
-					       pixman_format_code_t	format,
-					       pixman_repeat_t	repeat_mode)
-{
-    bits_image_t *bits = &image->bits;
-    pixman_fixed_t *params = image->common.filter_params;
-    int cwidth = pixman_fixed_to_int (params[0]);
-    int cheight = pixman_fixed_to_int (params[1]);
-    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
-    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
-    int x_phase_bits = pixman_fixed_to_int (params[2]);
-    int y_phase_bits = pixman_fixed_to_int (params[3]);
-    int x_phase_shift = 16 - x_phase_bits;
-    int y_phase_shift = 16 - y_phase_bits;
-    pixman_fixed_t vx, vy;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int k;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    vx = v.vector[0];
-    vy = v.vector[1];
-
-    for (k = 0; k < width; ++k)
-    {
-	pixman_fixed_t *y_params;
-	int satot, srtot, sgtot, sbtot;
-	pixman_fixed_t x, y;
-	int32_t x1, x2, y1, y2;
-	int32_t px, py;
-	int i, j;
-
-	if (mask && !mask[k])
-	    goto next;
-
-	/* Round x and y to the middle of the closest phase before continuing. This
-	 * ensures that the convolution matrix is aligned right, since it was
-	 * positioned relative to a particular phase (and not relative to whatever
-	 * exact fraction we happen to get here).
-	 */
-	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
-	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
-
-	px = (x & 0xffff) >> x_phase_shift;
-	py = (y & 0xffff) >> y_phase_shift;
-
-	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
-	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
-	x2 = x1 + cwidth;
-	y2 = y1 + cheight;
-
-	satot = srtot = sgtot = sbtot = 0;
-
-	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
-
-	for (i = y1; i < y2; ++i)
-	{
-	    pixman_fixed_t fy = *y_params++;
-
-	    if (fy)
-	    {
-		pixman_fixed_t *x_params = params + 4 + px * cwidth;
-
-		for (j = x1; j < x2; ++j)
-		{
-		    pixman_fixed_t fx = *x_params++;
-		    int rx = j;
-		    int ry = i;
-		    
-		    if (fx)
-		    {
-			pixman_fixed_t f;
-			uint32_t pixel, mask;
-			uint8_t *row;
-
-			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-			if (repeat_mode != PIXMAN_REPEAT_NONE)
-			{
-			    repeat (repeat_mode, &rx, bits->width);
-			    repeat (repeat_mode, &ry, bits->height);
-
-			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
-			    pixel = convert_pixel (row, rx) | mask;
-			}
-			else
-			{
-			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
-			    {
-				pixel = 0;
-			    }
-			    else
-			    {
-				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
-				pixel = convert_pixel (row, rx) | mask;
-			    }
-			}
-
-			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
-			srtot += (int)RED_8 (pixel) * f;
-			sgtot += (int)GREEN_8 (pixel) * f;
-			sbtot += (int)BLUE_8 (pixel) * f;
-			satot += (int)ALPHA_8 (pixel) * f;
-		    }
-		}
-	    }
-	}
-
-	satot = (satot + 0x8000) >> 16;
-	srtot = (srtot + 0x8000) >> 16;
-	sgtot = (sgtot + 0x8000) >> 16;
-	sbtot = (sbtot + 0x8000) >> 16;
-
-	satot = CLIP (satot, 0, 0xff);
-	srtot = CLIP (srtot, 0, 0xff);
-	sgtot = CLIP (sgtot, 0, 0xff);
-	sbtot = CLIP (sbtot, 0, 0xff);
-
-	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
-
-    next:
-	vx += ux;
-	vy += uy;
-    }
+    return iter->buffer;
 }
 
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask,
-
-				  convert_pixel_t	convert_pixel,
-				  pixman_format_code_t	format,
-				  pixman_repeat_t	repeat_mode)
+static uint32_t *
+bits_image_fetch_general_32 (pixman_iter_t  *iter,
+			     const uint32_t *mask)
 {
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int x1, y1, x2, y2;
-	uint32_t tl, tr, bl, br;
-	int32_t distx, disty;
-	int width = image->bits.width;
-	int height = image->bits.height;
-	const uint8_t *row1;
-	const uint8_t *row2;
-
-	if (mask && !mask[i])
-	    goto next;
-
-	x1 = x - pixman_fixed_1 / 2;
-	y1 = y - pixman_fixed_1 / 2;
-
-	distx = pixman_fixed_to_bilinear_weight (x1);
-	disty = pixman_fixed_to_bilinear_weight (y1);
-
-	y1 = pixman_fixed_to_int (y1);
-	y2 = y1 + 1;
-	x1 = pixman_fixed_to_int (x1);
-	x2 = x1 + 1;
-
-	if (repeat_mode != PIXMAN_REPEAT_NONE)
-	{
-	    uint32_t mask;
-
-	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    repeat (repeat_mode, &x1, width);
-	    repeat (repeat_mode, &y1, height);
-	    repeat (repeat_mode, &x2, width);
-	    repeat (repeat_mode, &y2, height);
-
-	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
-	    tl = convert_pixel (row1, x1) | mask;
-	    tr = convert_pixel (row1, x2) | mask;
-	    bl = convert_pixel (row2, x1) | mask;
-	    br = convert_pixel (row2, x2) | mask;
-	}
-	else
-	{
-	    uint32_t mask1, mask2;
-	    int bpp;
-
-	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
-	     * which means if you use it in expressions, those
-	     * expressions become unsigned themselves. Since
-	     * the variables below can be negative in some cases,
-	     * that will lead to crashes on 64 bit architectures.
-	     *
-	     * So this line makes sure bpp is signed
-	     */
-	    bpp = PIXMAN_FORMAT_BPP (format);
-
-	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
-	    {
-		buffer[i] = 0;
-		goto next;
-	    }
-
-	    if (y2 == 0)
-	    {
-		row1 = zero;
-		mask1 = 0;
-	    }
-	    else
-	    {
-		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-		row1 += bpp / 8 * x1;
-
-		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (y1 == height - 1)
-	    {
-		row2 = zero;
-		mask2 = 0;
-	    }
-	    else
-	    {
-		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-		row2 += bpp / 8 * x1;
-
-		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (x2 == 0)
-	    {
-		tl = 0;
-		bl = 0;
-	    }
-	    else
-	    {
-		tl = convert_pixel (row1, 0) | mask1;
-		bl = convert_pixel (row2, 0) | mask2;
-	    }
-
-	    if (x1 == width - 1)
-	    {
-		tr = 0;
-		br = 0;
-	    }
-	    else
-	    {
-		tr = convert_pixel (row1, 1) | mask1;
-		br = convert_pixel (row2, 1) | mask2;
-	    }
-	}
-
-	buffer[i] = bilinear_interpolation (
-	    tl, tr, bl, br, distx, disty);
-
-    next:
-	x += ux;
-	y += uy;
-    }
+    return __bits_image_fetch_general(iter, FALSE, mask);
 }
 
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
-				 int              offset,
-				 int              line,
-				 int              width,
-				 uint32_t *       buffer,
-				 const uint32_t * mask,
-				 
-				 convert_pixel_t	convert_pixel,
-				 pixman_format_code_t	format,
-				 pixman_repeat_t	repeat_mode)
+static uint32_t *
+bits_image_fetch_general_float (pixman_iter_t  *iter,
+				const uint32_t *mask)
 {
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int width, height, x0, y0;
-	const uint8_t *row;
-
-	if (mask && !mask[i])
-	    goto next;
-	
-	width = image->bits.width;
-	height = image->bits.height;
-	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-	if (repeat_mode == PIXMAN_REPEAT_NONE &&
-	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
-	{
-	    buffer[i] = 0;
-	}
-	else
-	{
-	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    if (repeat_mode != PIXMAN_REPEAT_NONE)
-	    {
-		repeat (repeat_mode, &x0, width);
-		repeat (repeat_mode, &y0, height);
-	    }
-
-	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
-	    buffer[i] = convert_pixel (row, x0) | mask;
-	}
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
+    return __bits_image_fetch_general(iter, TRUE, mask);
 }
 
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
-    return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
-    return convert_0565_to_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
-    static uint32_t *							\
-    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
-							    const uint32_t * mask) \
-    {									\
-	bits_image_fetch_separable_convolution_affine (                 \
-	    iter->image,                                                \
-	    iter->x, iter->y++,                                         \
-	    iter->width,                                                \
-	    iter->buffer, mask,                                         \
-	    convert_ ## format,                                         \
-	    PIXMAN_ ## format,                                          \
-	    repeat_mode);                                               \
-									\
-	return iter->buffer;                                            \
-    }
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
-    static uint32_t *							\
-    bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
-					       const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_bilinear_affine (iter->image,			\
-					  iter->x, iter->y++,		\
-					  iter->width,			\
-					  iter->buffer, mask,		\
-					  convert_ ## format,		\
-					  PIXMAN_ ## format,		\
-					  repeat_mode);			\
-	return iter->buffer;						\
-    }
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
-    static uint32_t *							\
-    bits_image_fetch_nearest_affine_ ## name (pixman_iter_t   *iter,	\
-					      const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_nearest_affine (iter->image,			\
-					 iter->x, iter->y++,		\
-					 iter->width,			\
-					 iter->buffer, mask,		\
-					 convert_ ## format,		\
-					 PIXMAN_ ## format,		\
-					 repeat_mode);			\
-	return iter->buffer;						\
-    }
-
-#define MAKE_FETCHERS(name, format, repeat_mode)			\
-    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
-    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
-
-MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
-
 static void
 replicate_pixel_32 (bits_image_t *   bits,
 		    int              x,
 		    int              y,
 		    int              width,
 		    uint32_t *       buffer)
 {
     uint32_t color;
@@ -1248,19 +772,19 @@ bits_image_fetch_untransformed_repeat_no
 	x += w;
     }
 
     if (x < image->width)
     {
 	w = MIN (width, image->width - x);
 
 	if (wide)
-	    image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
 	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
 
 	width -= w;
 	buffer += w * (wide? 4 : 1);
 	x += w;
     }
 
     memset (buffer, 0, width * (wide ? sizeof (argb_t) : 4));
 }
@@ -1296,19 +820,19 @@ bits_image_fetch_untransformed_repeat_no
 	while (x < 0)
 	    x += image->width;
 	while (x >= image->width)
 	    x -= image->width;
 
 	w = MIN (width, image->width - x);
 
 	if (wide)
-	    image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
 	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
 
 	buffer += w * (wide? 4 : 1);
 	x += w;
 	width -= w;
     }
 }
 
 static uint32_t *
@@ -1376,114 +900,28 @@ static const fetcher_info_t fetcher_info
        FAST_PATH_ID_TRANSFORM			|
        FAST_PATH_NO_CONVOLUTION_FILTER		|
        FAST_PATH_NO_PAD_REPEAT			|
        FAST_PATH_NO_REFLECT_REPEAT),
       bits_image_fetch_untransformed_32,
       bits_image_fetch_untransformed_float
     },
 
-#define FAST_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_X_UNIT_POSITIVE		|				\
-     FAST_PATH_Y_UNIT_ZERO		|				\
-     FAST_PATH_NONE_REPEAT		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-    { PIXMAN_a8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_float
-    },
-
-    { PIXMAN_x8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_float
-    },
-
-#define GENERAL_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_NEAREST_FILTER)
-
-#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
-    (FAST_PATH_NO_ALPHA_MAP            |				\
-     FAST_PATH_NO_ACCESSORS            |				\
-     FAST_PATH_HAS_TRANSFORM           |				\
-     FAST_PATH_AFFINE_TRANSFORM        |				\
-     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
-    
-#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
-    { PIXMAN_ ## format,                                               \
-      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
-      bits_image_fetch_separable_convolution_affine_ ## name,          \
-      _pixman_image_get_scanline_generic_float			       \
-    },
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_bilinear_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_float				\
-    },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_nearest_affine_ ## name,				\
-      _pixman_image_get_scanline_generic_float				\
-    },
-
-#define AFFINE_FAST_PATHS(name, format, repeat)				\
-    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
-    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-    
-    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
-    AFFINE_FAST_PATHS (none_a8, a8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
-    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
-    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
-    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
-    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
     /* Affine, no alpha */
     { PIXMAN_any,
       (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
-      bits_image_fetch_affine_no_alpha,
-      _pixman_image_get_scanline_generic_float
+      bits_image_fetch_affine_no_alpha_32,
+      bits_image_fetch_affine_no_alpha_float,
     },
 
     /* General */
     { PIXMAN_any,
       0,
-      bits_image_fetch_general,
-      _pixman_image_get_scanline_generic_float
+      bits_image_fetch_general_32,
+      bits_image_fetch_general_float,
     },
 
     { PIXMAN_null },
 };
 
 static void
 bits_image_property_changed (pixman_image_t *image)
 {
@@ -1503,65 +941,49 @@ void
 	    (info->flags & flags) == info->flags)
 	{
 	    if (iter->iter_flags & ITER_NARROW)
 	    {
 		iter->get_scanline = info->get_scanline_32;
 	    }
 	    else
 	    {
-		iter->data = info->get_scanline_32;
 		iter->get_scanline = info->get_scanline_float;
 	    }
 	    return;
 	}
     }
 
     /* Just in case we somehow didn't find a scanline function */
     iter->get_scanline = _pixman_iter_get_scanline_noop;
 }
 
 static uint32_t *
-dest_get_scanline_16 (pixman_iter_t *iter, const uint32_t *mask)
-{
-    pixman_image_t *image  = iter->image;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *	    buffer = iter->buffer;
-
-    image->bits.fetch_scanline_16 (image, x, y, width, buffer, mask);
-
-    return iter->buffer;
-}
-
-static uint32_t *
 dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
 {
     pixman_image_t *image  = iter->image;
     int             x      = iter->x;
     int             y      = iter->y;
     int             width  = iter->width;
     uint32_t *	    buffer = iter->buffer;
 
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+    image->bits.fetch_scanline_32 (&image->bits, x, y, width, buffer, mask);
     if (image->common.alpha_map)
     {
 	uint32_t *alpha;
 
 	if ((alpha = malloc (width * sizeof (uint32_t))))
 	{
 	    int i;
 
 	    x -= image->common.alpha_origin_x;
 	    y -= image->common.alpha_origin_y;
 
 	    image->common.alpha_map->fetch_scanline_32 (
-		(pixman_image_t *)image->common.alpha_map,
-		x, y, width, alpha, mask);
+		image->common.alpha_map, x, y, width, alpha, mask);
 
 	    for (i = 0; i < width; ++i)
 	    {
 		buffer[i] &= ~0xff000000;
 		buffer[i] |= (alpha[i] & 0xff000000);
 	    }
 
 	    free (alpha);
@@ -1576,57 +998,42 @@ dest_get_scanline_wide (pixman_iter_t *i
 {
     bits_image_t *  image  = &iter->image->bits;
     int             x      = iter->x;
     int             y      = iter->y;
     int             width  = iter->width;
     argb_t *	    buffer = (argb_t *)iter->buffer;
 
     image->fetch_scanline_float (
-	(pixman_image_t *)image, x, y, width, (uint32_t *)buffer, mask);
+	image, x, y, width, (uint32_t *)buffer, mask);
     if (image->common.alpha_map)
     {
 	argb_t *alpha;
 
 	if ((alpha = malloc (width * sizeof (argb_t))))
 	{
 	    int i;
 
 	    x -= image->common.alpha_origin_x;
 	    y -= image->common.alpha_origin_y;
 
 	    image->common.alpha_map->fetch_scanline_float (
-		(pixman_image_t *)image->common.alpha_map,
-		x, y, width, (uint32_t *)alpha, mask);
+		image->common.alpha_map, x, y, width, (uint32_t *)alpha, mask);
 
 	    for (i = 0; i < width; ++i)
 		buffer[i].a = alpha[i].a;
 
 	    free (alpha);
 	}
     }
 
     return iter->buffer;
 }
 
 static void
-dest_write_back_16 (pixman_iter_t *iter)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    const uint32_t *buffer = iter->buffer;
-
-    image->store_scanline_16 (image, x, y, width, buffer);
-
-    iter->y++;
-}
-
-static void
 dest_write_back_narrow (pixman_iter_t *iter)
 {
     bits_image_t *  image  = &iter->image->bits;
     int             x      = iter->x;
     int             y      = iter->y;
     int             width  = iter->width;
     const uint32_t *buffer = iter->buffer;
 
@@ -1639,25 +1046,155 @@ dest_write_back_narrow (pixman_iter_t *i
 
 	image->common.alpha_map->store_scanline_32 (
 	    image->common.alpha_map, x, y, width, buffer);
     }
 
     iter->y++;
 }
 
+static const float
+dither_factor_blue_noise_64 (int x, int y)
+{
+    float m = dither_blue_noise_64x64[((y & 0x3f) << 6) | (x & 0x3f)];
+    return m * (1. / 4096.f) + (1. / 8192.f);
+}
+
/* Ordered 8x8 Bayer dithering: compute the threshold-matrix entry for
 * (x, y) on the fly instead of reading it from a lookup table, then map
 * it into (0, 1).
 */
static float
dither_factor_bayer_8 (int x, int y)
{
    uint32_t m;

    y ^= x;

    /* Compute reverse(interleave(xor(x mod n, y mod n), x mod n))
     * Here n = 8 and `mod n` is the bottom 3 bits.
     */
    m = ((y & 0x1) << 5) | ((x & 0x1) << 4) |
	((y & 0x2) << 2) | ((x & 0x2) << 1) |
	((y & 0x4) >> 1) | ((x & 0x4) >> 2);

    /* m is in range [0, 63].  We scale it to [0, 63.0f/64.0f], then
     * shift it to [1.0f/128.0f, 127.0f/128.0f] so that 0 < d < 1.
     * This ensures exact values are not changed by dithering.
     */
    return (float)m * (1.f / 64.f) + (1.f / 128.f);
}
+
+typedef float (* dither_factor_t)(int x, int y);
+
+static force_inline float
+dither_apply_channel (float f, float d, float s)
+{
+    /* float_to_unorm splits the [0, 1] segment in (1 << n_bits)
+     * subsections of equal length; however unorm_to_float does not
+     * map to the center of those sections.  In fact, pixel value u is
+     * mapped to:
+     *
+     *       u              u              u               1
+     * -------------- = ---------- + -------------- * ----------
+     *  2^n_bits - 1     2^n_bits     2^n_bits - 1     2^n_bits
+     *
+     * Hence if f = u / (2^n_bits - 1) is exactly representable on a
+     * n_bits palette, all the numbers between
+     *
+     *     u
+     * ----------  =  f - f * 2^n_bits = f + (0 - f) * 2^n_bits
+     *  2^n_bits
+     *
+     *  and
+     *
+     *    u + 1
+     * ---------- = f - (f - 1) * 2^n_bits = f + (1 - f) * 2^n_bits
+     *  2^n_bits
+     *
+     * are also mapped back to u.
+     *
+     * Hence the following calculation ensures that we add as much
+     * noise as possible without perturbing values which are exactly
+     * representable in the target colorspace.  Note that this corresponds to
+     * mixing the original color with noise with a ratio of `1 / 2^n_bits`.
+     */
+    return f + (d - f) * s;
+}
+
+static force_inline float
+dither_compute_scale (int n_bits)
+{
+    // No dithering for wide formats
+    if (n_bits == 0 || n_bits >= 32)
+	return 0.f;
+
+    return 1.f / (float)(1 << n_bits);
+}
+
+static const uint32_t *
+dither_apply_ordered (pixman_iter_t *iter, dither_factor_t factor)
+{
+    bits_image_t        *image  = &iter->image->bits;
+    int                  x      = iter->x + image->dither_offset_x;
+    int                  y      = iter->y + image->dither_offset_y;
+    int                  width  = iter->width;
+    argb_t              *buffer = (argb_t *)iter->buffer;
+
+    pixman_format_code_t format = image->format;
+    int                  a_size = PIXMAN_FORMAT_A (format);
+    int                  r_size = PIXMAN_FORMAT_R (format);
+    int                  g_size = PIXMAN_FORMAT_G (format);
+    int                  b_size = PIXMAN_FORMAT_B (format);
+
+    float a_scale = dither_compute_scale (a_size);
+    float r_scale = dither_compute_scale (r_size);
+    float g_scale = dither_compute_scale (g_size);
+    float b_scale = dither_compute_scale (b_size);
+
+    int   i;
+    float d;
+
+    for (i = 0; i < width; ++i)
+    {
+	d = factor (x + i, y);
+
+	buffer->a = dither_apply_channel (buffer->a, d, a_scale);
+	buffer->r = dither_apply_channel (buffer->r, d, r_scale);
+	buffer->g = dither_apply_channel (buffer->g, d, g_scale);
+	buffer->b = dither_apply_channel (buffer->b, d, b_scale);
+
+	buffer++;
+    }
+
+    return iter->buffer;
+}
+
 static void
 dest_write_back_wide (pixman_iter_t *iter)
 {
     bits_image_t *  image  = &iter->image->bits;
     int             x      = iter->x;
     int             y      = iter->y;
     int             width  = iter->width;
     const uint32_t *buffer = iter->buffer;
 
+    switch (image->dither)
+    {
+    case PIXMAN_DITHER_NONE:
+	break;
+
+    case PIXMAN_DITHER_GOOD:
+    case PIXMAN_DITHER_BEST:
+    case PIXMAN_DITHER_ORDERED_BLUE_NOISE_64:
+	buffer = dither_apply_ordered (iter, dither_factor_blue_noise_64);
+	break;
+
+    case PIXMAN_DITHER_FAST:
+    case PIXMAN_DITHER_ORDERED_BAYER_8:
+	buffer = dither_apply_ordered (iter, dither_factor_bayer_8);
+	break;
+    }
+
     image->store_scanline_float (image, x, y, width, buffer);
 
     if (image->common.alpha_map)
     {
 	x -= image->common.alpha_origin_x;
 	y -= image->common.alpha_origin_y;
 
 	image->common.alpha_map->store_scanline_float (
@@ -1665,41 +1202,28 @@ dest_write_back_wide (pixman_iter_t *ite
     }
 
     iter->y++;
 }
 
 void
 _pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
 {
-    if (iter->iter_flags & ITER_16)
-    {
-        if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
-	{
-            iter->get_scanline = _pixman_iter_get_scanline_noop;
-        }
-        else
-        {
-	    iter->get_scanline = dest_get_scanline_16;
-        }
-	iter->write_back = dest_write_back_16;
-    }
-    else if (iter->iter_flags & ITER_NARROW)
+    if (iter->iter_flags & ITER_NARROW)
     {
 	if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
 	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
 	{
 	    iter->get_scanline = _pixman_iter_get_scanline_noop;
 	}
 	else
 	{
 	    iter->get_scanline = dest_get_scanline_narrow;
 	}
-
+	
 	iter->write_back = dest_write_back_narrow;
     }
     else
     {
 	iter->get_scanline = dest_get_scanline_wide;
 	iter->write_back = dest_write_back_wide;
     }
 }
@@ -1753,16 +1277,19 @@ pixman_bool_t
                          int                  width,
                          int                  height,
                          uint32_t *           bits,
                          int                  rowstride,
 			 pixman_bool_t	      clear)
 {
     uint32_t *free_me = NULL;
 
+    if (PIXMAN_FORMAT_BPP (format) == 128)
+	return_val_if_fail(!(rowstride % 4), FALSE);
+
     if (!bits && width && height)
     {
 	int rowstride_bytes;
 
 	free_me = bits = create_bits (format, width, height, &rowstride_bytes, clear);
 
 	if (!bits)
 	    return FALSE;
@@ -1773,16 +1300,19 @@ pixman_bool_t
     _pixman_image_init (image);
 
     image->type = BITS;
     image->bits.format = format;
     image->bits.width = width;
     image->bits.height = height;
     image->bits.bits = bits;
     image->bits.free_me = free_me;
+    image->bits.dither = PIXMAN_DITHER_NONE;
+    image->bits.dither_offset_x = 0;
+    image->bits.dither_offset_y = 0;
     image->bits.read_func = NULL;
     image->bits.write_func = NULL;
     image->bits.rowstride = rowstride;
     image->bits.indexed = NULL;
 
     image->common.property_changed = bits_image_property_changed;
 
     _pixman_image_reset_clip_region (image);
--- a/gfx/cairo/libpixman/src/pixman-combine-float.c
+++ b/gfx/cairo/libpixman/src/pixman-combine-float.c
@@ -37,18 +37,16 @@
 
 /* Workaround for http://gcc.gnu.org/PR54965 */
 /* GCC 4.6 has problems with force_inline, so just use normal inline instead */
 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 6)
 #undef force_inline
 #define force_inline __inline__
 #endif
 
-#define IS_ZERO(f)     (-FLT_MIN < (f) && (f) < FLT_MIN)
-
 typedef float (* combine_channel_t) (float sa, float s, float da, float d);
 
 static force_inline void
 combine_inner (pixman_bool_t component,
 	       float *dest, const float *src, const float *mask, int n_pixels,
 	       combine_channel_t combine_a, combine_channel_t combine_c)
 {
     int i;
@@ -198,66 +196,66 @@ get_factor (combine_factor_t factor, flo
 	f = 1 - sa;
 	break;
 
     case INV_DA:
 	f = 1 - da;
 	break;
 
     case SA_OVER_DA:
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	    f = 1.0f;
 	else
 	    f = CLAMP (sa / da);
 	break;
 
     case DA_OVER_SA:
-	if (IS_ZERO (sa))
+	if (FLOAT_IS_ZERO (sa))
 	    f = 1.0f;
 	else
 	    f = CLAMP (da / sa);
 	break;
 
     case INV_SA_OVER_DA:
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	    f = 1.0f;
 	else
 	    f = CLAMP ((1.0f - sa) / da);
 	break;
 
     case INV_DA_OVER_SA:
-	if (IS_ZERO (sa))
+	if (FLOAT_IS_ZERO (sa))
 	    f = 1.0f;
 	else
 	    f = CLAMP ((1.0f - da) / sa);
 	break;
 
     case ONE_MINUS_SA_OVER_DA:
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	    f = 0.0f;
 	else
 	    f = CLAMP (1.0f - sa / da);
 	break;
 
     case ONE_MINUS_DA_OVER_SA:
-	if (IS_ZERO (sa))
+	if (FLOAT_IS_ZERO (sa))
 	    f = 0.0f;
 	else
 	    f = CLAMP (1.0f - da / sa);
 	break;
 
     case ONE_MINUS_INV_DA_OVER_SA:
-	if (IS_ZERO (sa))
+	if (FLOAT_IS_ZERO (sa))
 	    f = 0.0f;
 	else
 	    f = CLAMP (1.0f - (1.0f - da) / sa);
 	break;
 
     case ONE_MINUS_INV_SA_OVER_DA:
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	    f = 0.0f;
 	else
 	    f = CLAMP (1.0f - (1.0f - sa) / da);
 	break;
     }
 
     return f;
 }
@@ -316,33 +314,54 @@ MAKE_PD_COMBINERS (conjoint_atop,		DA_OV
 MAKE_PD_COMBINERS (conjoint_atop_reverse,	ONE_MINUS_DA_OVER_SA,		SA_OVER_DA)
 MAKE_PD_COMBINERS (conjoint_xor,		ONE_MINUS_DA_OVER_SA,		ONE_MINUS_SA_OVER_DA)
 
 /*
  * PDF blend modes:
  *
  * The following blend modes have been taken from the PDF ISO 32000
  * specification, which at this point in time is available from
- * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
- * The relevant chapters are 11.3.5 and 11.3.6.
+ *
+ *     http://www.adobe.com/devnet/pdf/pdf_reference.html
+ *
+ * The specific documents of interest are the PDF spec itself:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
+ *
+ * chapters 11.3.5 and 11.3.6 and a later supplement for Adobe Acrobat
+ * 9.1 and Reader 9.1:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000_1.pdf
+ *
+ * that clarifies the specifications for blend modes ColorDodge and
+ * ColorBurn.
+ *
  * The formula for computing the final pixel color given in 11.3.6 is:
- * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
- * with B() being the blend function.
- * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
+ *
+ *     αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
  *
- * These blend modes should match the SVG filter draft specification, as
- * it has been designed to mirror ISO 32000. Note that at the current point
- * no released draft exists that shows this, as the formulas have not been
- * updated yet after the release of ISO 32000.
+ * with B() is the blend function. When B(Cb, Cs) = Cs, this formula
+ * reduces to the regular OVER operator.
+ *
+ * Cs and Cb are not premultiplied, so in our implementation we instead
+ * use:
+ *
+ *     cr = (1 – αs) × cb  +  (1 – αb) × cs  +  αb × αs × B (cb/αb, cs/αs)
+ *
+ * where cr, cs, and cb are premultiplied colors, and where the
  *
- * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
- * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
- * argument. Note that this implementation operates on premultiplied colors,
- * while the PDF specification does not. Therefore the code uses the formula
- * ar.Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
+ *     αb × αs × B(cb/αb, cs/αs)
+ *
+ * part is first arithmetically simplified under the assumption that αb
+ * and αs are not 0, and then updated to produce a meaningful result when
+ * they are.
+ *
+ * For all the blend mode operators, the alpha channel is given by
+ *
+ *     αr = αs + αb + αb × αs
  */
 
 #define MAKE_SEPARABLE_PDF_COMBINERS(name)				\
     static force_inline float						\
     combine_ ## name ## _a (float sa, float s, float da, float d)	\
     {									\
 	return da + sa - da * sa;					\
     }									\
@@ -352,134 +371,278 @@ MAKE_PD_COMBINERS (conjoint_xor,		ONE_MI
     {									\
 	float f = (1 - sa) * d + (1 - da) * s;				\
 									\
 	return f + blend_ ## name (sa, s, da, d);			\
     }									\
     									\
     MAKE_COMBINERS (name, combine_ ## name ## _a, combine_ ## name ## _c)
 
+/*
+ * Multiply
+ *
+ *      ad * as * B(d / ad, s / as)
+ *    = ad * as * d/ad * s/as
+ *    = d * s
+ *
+ */
 static force_inline float
 blend_multiply (float sa, float s, float da, float d)
 {
     return d * s;
 }
 
+/*
+ * Screen
+ *
+ *      ad * as * B(d/ad, s/as)
+ *    = ad * as * (d/ad + s/as - s/as * d/ad)
+ *    = ad * s + as * d - s * d
+ */
 static force_inline float
 blend_screen (float sa, float s, float da, float d)
 {
     return d * sa + s * da - s * d;
 }
 
+/*
+ * Overlay
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * Hardlight (s, d)
+ *   = if (d / ad < 0.5)
+ *         as * ad * Multiply (s/as, 2 * d/ad)
+ *     else
+ *         as * ad * Screen (s/as, 2 * d / ad - 1)
+ *   = if (d < 0.5 * ad)
+ *         as * ad * s/as * 2 * d /ad
+ *     else
+ *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
+ */
 static force_inline float
 blend_overlay (float sa, float s, float da, float d)
 {
     if (2 * d < da)
 	return 2 * s * d;
     else
 	return sa * da - 2 * (da - d) * (sa - s);
 }
 
+/*
+ * Darken
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MIN(d/ad, s/as)
+ *   = MIN (as * d, ad * s)
+ */
 static force_inline float
 blend_darken (float sa, float s, float da, float d)
 {
     s = s * da;
     d = d * sa;
 
     if (s > d)
 	return d;
     else
 	return s;
 }
 
+/*
+ * Lighten
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MAX(d/ad, s/as)
+ *   = MAX (as * d, ad * s)
+ */
 static force_inline float
 blend_lighten (float sa, float s, float da, float d)
 {
     s = s * da;
     d = d * sa;
 
     if (s > d)
 	return s;
     else
 	return d;
 }
 
+/*
+ * Color dodge
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad = 0
+ *         ad * as * 0
+ *     else if (d/ad >= (1 - s/as)
+ *         ad * as * 1
+ *     else
+ *         ad * as * ((d/ad) / (1 - s/as))
+ *   = if d = 0
+ *         0
+ *     elif as * d >= ad * (as - s)
+ *         ad * as
+ *     else
+ *         as * (as * d / (as - s))
+ *
+ */
 static force_inline float
 blend_color_dodge (float sa, float s, float da, float d)
 {
-    if (IS_ZERO (d))
+    if (FLOAT_IS_ZERO (d))
 	return 0.0f;
     else if (d * sa >= sa * da - s * da)
 	return sa * da;
-    else if (IS_ZERO (sa - s))
+    else if (FLOAT_IS_ZERO (sa - s))
 	return sa * da;
     else
 	return sa * sa * d / (sa - s);
 }
 
+/*
+ * Color burn
+ *
+ * We modify the first clause "if d = 1" to "if d >= 1" since with
+ * premultiplied colors d > 1 can actually happen.
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad >= 1
+ *         ad * as * 1
+ *     elif (1 - d/ad) >= s/as
+ *         ad * as * 0
+ *     else
+ *         ad * as * (1 - ((1 - d/ad) / (s/as)))
+ *   = if d >= ad
+ *         ad * as
+ *     elif as * ad - as * d >= ad * s
+ *         0
+ *     else
+ *         ad * as  - as * as * (ad - d) / s
+ */
 static force_inline float
 blend_color_burn (float sa, float s, float da, float d)
 {
     if (d >= da)
 	return sa * da;
     else if (sa * (da - d) >= s * da)
 	return 0.0f;
-    else if (IS_ZERO (s))
+    else if (FLOAT_IS_ZERO (s))
 	return 0.0f;
     else
 	return sa * (da - sa * (da - d) / s);
 }
 
+/*
+ * Hard light
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * Multiply (d/ad, 2 * s/as)
+ *     else
+ *         ad * as * Screen (d/ad, 2 * s/as - 1)
+ *   = if 2 * s <= as
+ *         ad * as * d/ad * 2 * s / as
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
+ *   = if 2 * s <= as
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
+ */
 static force_inline float
 blend_hard_light (float sa, float s, float da, float d)
 {
     if (2 * s < sa)
 	return 2 * s * d;
     else
 	return sa * da - 2 * (da - d) * (sa - s);
 }
 
+/*
+ * Soft light
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
+ *     else if (d/ad <= 0.25)
+ *         ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
+ *   = if (2 * s <= as)
+ *         d * as - d * (ad - d) * (as - 2 * s) / ad;
+ *     else if (4 * d <= ad)
+ *         (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
+ *     else
+ *         d * as + (sqrt (d * ad) - d) * (2 * s - as);
+ */
 static force_inline float
 blend_soft_light (float sa, float s, float da, float d)
 {
-    if (2 * s < sa)
+    if (2 * s <= sa)
     {
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	    return d * sa;
 	else
 	    return d * sa - d * (da - d) * (sa - 2 * s) / da;
     }
     else
     {
-	if (IS_ZERO (da))
+	if (FLOAT_IS_ZERO (da))
 	{
-	    return 0.0f;
+	    return d * sa;
 	}
 	else
 	{
 	    if (4 * d <= da)
 		return d * sa + (2 * s - sa) * d * ((16 * d / da - 12) * d / da + 3);
 	    else
 		return d * sa + (sqrtf (d * da) - d) * (2 * s - sa);
 	}
     }
 }
 
+/*
+ * Difference
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * abs (s/as - d/ad)
+ *   = if (s/as <= d/ad)
+ *         ad * as * (d/ad - s/as)
+ *     else
+ *         ad * as * (s/as - d/ad)
+ *   = if (ad * s <= as * d)
+ *        as * d - ad * s
+ *     else
+ *        ad * s - as * d
+ */
 static force_inline float
 blend_difference (float sa, float s, float da, float d)
 {
     float dsa = d * sa;
     float sda = s * da;
 
     if (sda < dsa)
 	return dsa - sda;
     else
 	return sda - dsa;
 }
 
+/*
+ * Exclusion
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
+ *   = as * d + ad * s - 2 * s * d
+ */
 static force_inline float
 blend_exclusion (float sa, float s, float da, float d)
 {
     return s * da + d * sa - 2 * d * s;
 }
 
 MAKE_SEPARABLE_PDF_COMBINERS (multiply)
 MAKE_SEPARABLE_PDF_COMBINERS (screen)
@@ -489,126 +652,89 @@ MAKE_SEPARABLE_PDF_COMBINERS (lighten)
 MAKE_SEPARABLE_PDF_COMBINERS (color_dodge)
 MAKE_SEPARABLE_PDF_COMBINERS (color_burn)
 MAKE_SEPARABLE_PDF_COMBINERS (hard_light)
 MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
 MAKE_SEPARABLE_PDF_COMBINERS (difference)
 MAKE_SEPARABLE_PDF_COMBINERS (exclusion)
 
 /*
- * PDF nonseperable blend modes.
- *
- * These are implemented using the following functions to operate in Hsl
- * space, with Cmax, Cmid, Cmin referring to the max, mid and min value
- * of the red, green and blue components.
+ * PDF nonseperable blend modes are implemented using the following functions
+ * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
  *
  * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
  *
  * clip_color (C):
- *   l = LUM (C)
- *   min = Cmin
- *   max = Cmax
- *   if n < 0.0
- *     C = l + (((C – l) × l) ⁄     (l – min))
- *   if x > 1.0
- *     C = l + (((C – l) × (1 – l)) (max – l))
- *   return C
+ *     l = LUM (C)
+ *     min = Cmin
+ *     max = Cmax
+ *     if n < 0.0
+ *         C = l + (((C – l) × l) ⁄ (l – min))
+ *     if x > 1.0
+ *         C = l + (((C – l) × (1 – l) ) ⁄ (max – l))
+ *     return C
  *
  * set_lum (C, l):
- *   d = l – LUM (C)
- *   C += d
- *   return clip_color (C)
+ *     d = l – LUM (C)
+ *     C += d
+ *     return clip_color (C)
  *
  * SAT (C) = CH_MAX (C) - CH_MIN (C)
  *
  * set_sat (C, s):
- *  if Cmax > Cmin
- *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
- *    Cmax = s
- *  else
- *    Cmid = Cmax = 0.0
- *  Cmin = 0.0
- *  return C
+ *     if Cmax > Cmin
+ *         Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *         Cmax = s
+ *     else
+ *         Cmid = Cmax = 0.0
+ *         Cmin = 0.0
+ *     return C
  */
 
 /* For premultiplied colors, we need to know what happens when C is
  * multiplied by a real number. LUM and SAT are linear:
  *
- *    LUM (r × C) = r × LUM (C)		SAT (r × C) = r × SAT (C)
+ *     LUM (r × C) = r × LUM (C)	SAT (r * C) = r * SAT (C)
  *
  * If we extend clip_color with an extra argument a and change
  *
- *        if x >= 1.0
+ *     if x >= 1.0
  *
  * into
  *
- *        if x >= a
+ *     if x >= a
  *
  * then clip_color is also linear:
  *
- *     r * clip_color (C, a) = clip_color (r_c, ra);
+ *     r * clip_color (C, a) = clip_color (r * C, r * a);
  *
  * for positive r.
  *
  * Similarly, we can extend set_lum with an extra argument that is just passed
  * on to clip_color:
  *
- *     r × set_lum ( C, l, a)
+ *       r * set_lum (C, l, a)
  *
- *   = r × clip_color ( C + l - LUM (C), a)
+ *     = r × clip_color (C + l - LUM (C), a)
  *
- *   = clip_color ( r * C + r × l - LUM (r × C), r * a)
+ *     = clip_color (r * C + r × l - r * LUM (C), r * a)
  *
- *   = set_lum ( r * C, r * l, r * a)
+ *     = set_lum (r * C, r * l, r * a)
  *
  * Finally, set_sat:
  *
- *     r * set_sat (C, s) = set_sat (x * C, r * s)
+ *       r * set_sat (C, s) = set_sat (x * C, r * s)
  *
- * The above holds for all non-zero x because they x'es in the fraction for
+ * The above holds for all non-zero x, because the x'es in the fraction for
  * C_mid cancel out. Specifically, it holds for x = r:
  *
- *     r * set_sat (C, s) = set_sat (r_c, rs)
- *
- *
- *
- *
- * So, for the non-separable PDF blend modes, we have (using s, d for
- * non-premultiplied colors, and S, D for premultiplied:
- *
- *   Color:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
- *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
- *
- *
- *   Luminosity:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
- *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
- *
- *
- *   Saturation:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
- *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
- *                                        a_s * LUM (D), a_s * a_d)
- *   = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d))
- *
- *   Hue:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
- *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *       r * set_sat (C, s) = set_sat (r * C, r * s)
  *
  */
-
 typedef struct
 {
     float	r;
     float	g;
     float	b;
 } rgb_t;
 
 static force_inline float
@@ -653,33 +779,33 @@ clip_color (rgb_t *color, float a)
     float l = get_lum (color);
     float n = channel_min (color);
     float x = channel_max (color);
     float t;
 
     if (n < 0.0f)
     {
 	t = l - n;
-	if (IS_ZERO (t))
+	if (FLOAT_IS_ZERO (t))
 	{
 	    color->r = 0.0f;
 	    color->g = 0.0f;
 	    color->b = 0.0f;
 	}
 	else
 	{
 	    color->r = l + (((color->r - l) * l) / t);
 	    color->g = l + (((color->g - l) * l) / t);
 	    color->b = l + (((color->b - l) * l) / t);
 	}
     }
     if (x > a)
     {
 	t = x - l;
-	if (IS_ZERO (t))
+	if (FLOAT_IS_ZERO (t))
 	{
 	    color->r = a;
 	    color->g = a;
 	    color->b = a;
 	}
 	else
 	{
 	    color->r = l + (((color->r - l) * (a - l) / t));
@@ -753,82 +879,96 @@ set_sat (rgb_t *src, float sat)
 		max = &(src->b);
 		mid = &(src->g);
 	    }
 	}
     }
 
     t = *max - *min;
 
-    if (IS_ZERO (t))
+    if (FLOAT_IS_ZERO (t))
     {
 	*mid = *max = 0.0f;
     }
     else
     {
 	*mid = ((*mid - *min) * sat) / t;
 	*max = sat;
     }
 
     *min = 0.0f;
 }
 
-/*
- * Hue:
- * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
+/* Hue:
+ *
+ *       as * ad * B(s/as, d/as)
+ *     = as * ad * set_lum (set_sat (s/as, SAT (d/ad)), LUM (d/ad), 1)
+ *     = set_lum (set_sat (ad * s, as * SAT (d)), as * LUM (d), as * ad)
+ *
  */
 static force_inline void
 blend_hsl_hue (rgb_t *res,
 	       const rgb_t *dest, float da,
 	       const rgb_t *src, float sa)
 {
     res->r = src->r * da;
     res->g = src->g * da;
     res->b = src->b * da;
 
     set_sat (res, get_sat (dest) * sa);
     set_lum (res, sa * da, get_lum (dest) * sa);
 }
 
-/*
- * Saturation:
- * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
+/* 
+ * Saturation
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (set_sat (d/ad, SAT (s/as)), LUM (d/ad), 1)
+ *   = set_lum (as * ad * set_sat (d/ad, SAT (s/as)),
+ *                                       as * LUM (d), as * ad)
+ *   = set_lum (set_sat (as * d, ad * SAT (s), as * LUM (d), as * ad))
  */
 static force_inline void
 blend_hsl_saturation (rgb_t *res,
 		      const rgb_t *dest, float da,
 		      const rgb_t *src, float sa)
 {
     res->r = dest->r * sa;
     res->g = dest->g * sa;
     res->b = dest->b * sa;
 
     set_sat (res, get_sat (src) * da);
     set_lum (res, sa * da, get_lum (dest) * sa);
 }
 
-/*
- * Color:
- * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
+/* 
+ * Color
+ *
+ *     as * ad * B(s/as, d/as)
+ *   = as * ad * set_lum (s/as, LUM (d/ad), 1)
+ *   = set_lum (s * ad, as * LUM (d), as * ad)
  */
 static force_inline void
 blend_hsl_color (rgb_t *res,
 		 const rgb_t *dest, float da,
 		 const rgb_t *src, float sa)
 {
     res->r = src->r * da;
     res->g = src->g * da;
     res->b = src->b * da;
 
     set_lum (res, sa * da, get_lum (dest) * sa);
 }
 
 /*
- * Luminosity:
- * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
+ * Luminosity
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (d/ad, LUM (s/as), 1)
+ *   = set_lum (as * d, ad * LUM (s), as * ad)
  */
 static force_inline void
 blend_hsl_luminosity (rgb_t *res,
 		      const rgb_t *dest, float da,
 		      const rgb_t *src, float sa)
 {
     res->r = dest->r * sa;
     res->g = dest->g * sa;
deleted file mode 100644
--- a/gfx/cairo/libpixman/src/pixman-combine.c.template
+++ /dev/null
@@ -1,2461 +0,0 @@
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <math.h>
-#include <string.h>
-
-#include "pixman-private.h"
-
-#include "pixman-combine.h"
-
-/*** per channel helper functions ***/
-
-static void
-combine_mask_ca (comp4_t *src, comp4_t *mask)
-{
-    comp4_t a = *mask;
-
-    comp4_t x;
-    comp2_t xa;
-
-    if (!a)
-    {
-	*(src) = 0;
-	return;
-    }
-
-    x = *(src);
-    if (a == ~0)
-    {
-	x = x >> A_SHIFT;
-	x |= x << G_SHIFT;
-	x |= x << R_SHIFT;
-	*(mask) = x;
-	return;
-    }
-
-    xa = x >> A_SHIFT;
-    UNcx4_MUL_UNcx4 (x, a);
-    *(src) = x;
-    
-    UNcx4_MUL_UNc (a, xa);
-    *(mask) = a;
-}
-
-static void
-combine_mask_value_ca (comp4_t *src, const comp4_t *mask)
-{
-    comp4_t a = *mask;
-    comp4_t x;
-
-    if (!a)
-    {
-	*(src) = 0;
-	return;
-    }
-
-    if (a == ~0)
-	return;
-
-    x = *(src);
-    UNcx4_MUL_UNcx4 (x, a);
-    *(src) = x;
-}
-
-static void
-combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
-{
-    comp4_t a = *(mask);
-    comp4_t x;
-
-    if (!a)
-	return;
-
-    x = *(src) >> A_SHIFT;
-    if (x == MASK)
-	return;
-
-    if (a == ~0)
-    {
-	x |= x << G_SHIFT;
-	x |= x << R_SHIFT;
-	*(mask) = x;
-	return;
-    }
-
-    UNcx4_MUL_UNc (a, x);
-    *(mask) = a;
-}
-
-/*
- * There are two ways of handling alpha -- either as a single unified value or
- * a separate value for each component, hence each macro must have two
- * versions.  The unified alpha version has a 'U' at the end of the name,
- * the component version has a 'C'.  Similarly, functions which deal with
- * this difference will have two versions using the same convention.
- */
-
-/*
- * All of the composing functions
- */
-
-static force_inline comp4_t
-combine_mask (const comp4_t *src, const comp4_t *mask, int i)
-{
-    comp4_t s, m;
-
-    if (mask)
-    {
-	m = *(mask + i) >> A_SHIFT;
-
-	if (!m)
-	    return 0;
-    }
-
-    s = *(src + i);
-
-    if (mask)
-	UNcx4_MUL_UNc (s, m);
-
-    return s;
-}
-
-static void
-combine_clear (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    memset (dest, 0, width * sizeof(comp4_t));
-}
-
-static void
-combine_dst (pixman_implementation_t *imp,
-	     pixman_op_t	      op,
-	     comp4_t *		      dest,
-	     const comp4_t *	      src,
-	     const comp4_t *          mask,
-	     int		      width)
-{
-    return;
-}
-
-static void
-combine_src_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    if (!mask)
-	memcpy (dest, src, width * sizeof (comp4_t));
-    else
-    {
-	for (i = 0; i < width; ++i)
-	{
-	    comp4_t s = combine_mask (src, mask, i);
-
-	    *(dest + i) = s;
-	}
-    }
-}
-
-/* if the Src is opaque, call combine_src_u */
-static void
-combine_over_u (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ia = ALPHA_c (~s);
-
-	UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Dst is opaque, this is a noop */
-static void
-combine_over_reverse_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ia = ALPHA_c (~*(dest + i));
-	UNcx4_MUL_UNc_ADD_UNcx4 (s, ia, d);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Dst is opaque, call combine_src_u */
-static void
-combine_in_u (pixman_implementation_t *imp,
-              pixman_op_t              op,
-              comp4_t *                dest,
-              const comp4_t *          src,
-              const comp4_t *          mask,
-              int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t a = ALPHA_c (*(dest + i));
-	UNcx4_MUL_UNc (s, a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, this is a noop */
-static void
-combine_in_reverse_u (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      comp4_t *                dest,
-                      const comp4_t *          src,
-                      const comp4_t *          mask,
-                      int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t a = ALPHA_c (s);
-	UNcx4_MUL_UNc (d, a);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Dst is opaque, call combine_clear */
-static void
-combine_out_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t a = ALPHA_c (~*(dest + i));
-	UNcx4_MUL_UNc (s, a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_clear */
-static void
-combine_out_reverse_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t a = ALPHA_c (~s);
-	UNcx4_MUL_UNc (d, a);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Src is opaque, call combine_in_u */
-/* if the Dst is opaque, call combine_over_u */
-/* if both the Src and Dst are opaque, call combine_src_u */
-static void
-combine_atop_u (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t dest_a = ALPHA_c (d);
-	comp4_t src_ia = ALPHA_c (~s);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_a, d, src_ia);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_over_reverse_u */
-/* if the Dst is opaque, call combine_in_reverse_u */
-/* if both the Src and Dst are opaque, call combine_dst_u */
-static void
-combine_atop_reverse_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t src_a = ALPHA_c (s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_over_u */
-/* if the Dst is opaque, call combine_over_reverse_u */
-/* if both the Src and Dst are opaque, call combine_clear */
-static void
-combine_xor_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t src_ia = ALPHA_c (~s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_ia);
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_add_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	UNcx4_ADD_UNcx4 (d, s);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Src is opaque, call combine_add_u */
-/* if the Dst is opaque, call combine_add_u */
-/* if both the Src and Dst are opaque, call combine_add_u */
-static void
-combine_saturate_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    comp4_t *                dest,
-                    const comp4_t *          src,
-                    const comp4_t *          mask,
-                    int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp2_t sa, da;
-
-	sa = s >> A_SHIFT;
-	da = ~d >> A_SHIFT;
-	if (sa > da)
-	{
-	    sa = DIV_UNc (da, sa);
-	    UNcx4_MUL_UNc (s, sa);
-	}
-	;
-	UNcx4_ADD_UNcx4 (d, s);
-	*(dest + i) = d;
-    }
-}
-
-/*
- * PDF blend modes:
- * The following blend modes have been taken from the PDF ISO 32000
- * specification, which at this point in time is available from
- * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
- * The relevant chapters are 11.3.5 and 11.3.6.
- * The formula for computing the final pixel color given in 11.3.6 is:
- * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
- * with B() being the blend function.
- * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
- *
- * These blend modes should match the SVG filter draft specification, as
- * it has been designed to mirror ISO 32000. Note that at the current point
- * no released draft exists that shows this, as the formulas have not been
- * updated yet after the release of ISO 32000.
- *
- * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
- * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
- * argument. Note that this implementation operates on premultiplied colors,
- * while the PDF specification does not. Therefore the code uses the formula
- * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
- */
-
-/*
- * Multiply
- * B(Dca, ad, Sca, as) = Dca.Sca
- */
-
-static void
-combine_multiply_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    comp4_t *                dest,
-                    const comp4_t *          src,
-                    const comp4_t *          mask,
-                    int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ss = s;
-	comp4_t src_ia = ALPHA_c (~s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (ss, dest_ia, d, src_ia);
-	UNcx4_MUL_UNcx4 (d, s);
-	UNcx4_ADD_UNcx4 (d, ss);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_multiply_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     comp4_t *                dest,
-                     const comp4_t *          src,
-                     const comp4_t *          mask,
-                     int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t m = *(mask + i);
-	comp4_t s = *(src + i);
-	comp4_t d = *(dest + i);
-	comp4_t r = d;
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	combine_mask_value_ca (&s, &m);
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (r, ~m, s, dest_ia);
-	UNcx4_MUL_UNcx4 (d, s);
-	UNcx4_ADD_UNcx4 (r, d);
-
-	*(dest + i) = r;
-    }
-}
-
-#define PDF_SEPARABLE_BLEND_MODE(name)					\
-    static void								\
-    combine_ ## name ## _u (pixman_implementation_t *imp,		\
-			    pixman_op_t              op,		\
-                            comp4_t *                dest,		\
-			    const comp4_t *          src,		\
-			    const comp4_t *          mask,		\
-			    int                      width)		\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i) {					\
-	    comp4_t s = combine_mask (src, mask, i);			\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t sa = ALPHA_c (s);					\
-	    comp1_t isa = ~sa;						\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-									\
-	    result = d;							\
-	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
-	    								\
-	    *(dest + i) = result +					\
-		(DIV_ONE_UNc (sa * (comp4_t)da) << A_SHIFT) +		\
-		(blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \
-		(blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \
-		(blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa));	\
-	}								\
-    }									\
-    									\
-    static void								\
-    combine_ ## name ## _ca (pixman_implementation_t *imp,		\
-			     pixman_op_t              op,		\
-                             comp4_t *                dest,		\
-			     const comp4_t *          src,		\
-			     const comp4_t *          mask,		\
-			     int                     width)		\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i) {					\
-	    comp4_t m = *(mask + i);					\
-	    comp4_t s = *(src + i);					\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-            								\
-	    combine_mask_value_ca (&s, &m);				\
-            								\
-	    result = d;							\
-	    UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida);     \
-            								\
-	    result +=							\
-	        (DIV_ONE_UNc (ALPHA_c (m) * (comp4_t)da) << A_SHIFT) +	\
-	        (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \
-	        (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \
-	        (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \
-	    								\
-	    *(dest + i) = result;					\
-	}								\
-    }
-
-/*
- * Screen
- * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca
- */
-static inline comp4_t
-blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    return DIV_ONE_UNc (sca * da + dca * sa - sca * dca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (screen)
-
-/*
- * Overlay
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Dca < Da
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
- */
-static inline comp4_t
-blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t rca;
-
-    if (2 * dca < da)
-	rca = 2 * sca * dca;
-    else
-	rca = sa * da - 2 * (da - dca) * (sa - sca);
-    return DIV_ONE_UNc (rca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (overlay)
-
-/*
- * Darken
- * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
- */
-static inline comp4_t
-blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t s, d;
-
-    s = sca * da;
-    d = dca * sa;
-    return DIV_ONE_UNc (s > d ? d : s);
-}
-
-PDF_SEPARABLE_BLEND_MODE (darken)
-
-/*
- * Lighten
- * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
- */
-static inline comp4_t
-blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t s, d;
-
-    s = sca * da;
-    d = dca * sa;
-    return DIV_ONE_UNc (s > d ? s : d);
-}
-
-PDF_SEPARABLE_BLEND_MODE (lighten)
-
-/*
- * Color dodge
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == 0
- *     0
- *   if Sca == Sa
- *     Sa.Da
- *   otherwise
- *     Sa.Da. min (1, Dca / Da / (1 - Sca/Sa))
- */
-static inline comp4_t
-blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (sca >= sa)
-    {
-	return dca == 0 ? 0 : DIV_ONE_UNc (sa * da);
-    }
-    else
-    {
-	comp4_t rca = dca * sa / (sa - sca);
-	return DIV_ONE_UNc (sa * MIN (rca, da));
-    }
-}
-
-PDF_SEPARABLE_BLEND_MODE (color_dodge)
-
-/*
- * Color burn
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == Da
- *     Sa.Da
- *   if Sca == 0
- *     0
- *   otherwise
- *     Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca))
- */
-static inline comp4_t
-blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (sca == 0)
-    {
-	return dca < da ? 0 : DIV_ONE_UNc (sa * da);
-    }
-    else
-    {
-	comp4_t rca = (da - dca) * sa / sca;
-	return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
-    }
-}
-
-PDF_SEPARABLE_BLEND_MODE (color_burn)
-
-/*
- * Hard light
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Sca < Sa
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
- */
-static inline comp4_t
-blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (2 * sca < sa)
-	return DIV_ONE_UNc (2 * sca * dca);
-    else
-	return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
-}
-
-PDF_SEPARABLE_BLEND_MODE (hard_light)
-
-/*
- * Soft light
- * B(Dca, Da, Sca, Sa) =
- *   if (2.Sca <= Sa)
- *     Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa))
- *   otherwise if Dca.4 <= Da
- *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3)
- *   otherwise
- *     (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
- */
-static inline comp4_t
-blend_soft_light (comp4_t dca_org,
-		  comp4_t da_org,
-		  comp4_t sca_org,
-		  comp4_t sa_org)
-{
-    double dca = dca_org * (1.0 / MASK);
-    double da = da_org * (1.0 / MASK);
-    double sca = sca_org * (1.0 / MASK);
-    double sa = sa_org * (1.0 / MASK);
-    double rca;
-
-    if (2 * sca < sa)
-    {
-	if (da == 0)
-	    rca = dca * sa;
-	else
-	    rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
-    }
-    else if (da == 0)
-    {
-	rca = 0;
-    }
-    else if (4 * dca <= da)
-    {
-	rca = dca * sa +
-	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
-    }
-    else
-    {
-	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
-    }
-    return rca * MASK + 0.5;
-}
-
-PDF_SEPARABLE_BLEND_MODE (soft_light)
-
-/*
- * Difference
- * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
- */
-static inline comp4_t
-blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t dcasa = dca * sa;
-    comp4_t scada = sca * da;
-
-    if (scada < dcasa)
-	return DIV_ONE_UNc (dcasa - scada);
-    else
-	return DIV_ONE_UNc (scada - dcasa);
-}
-
-PDF_SEPARABLE_BLEND_MODE (difference)
-
-/*
- * Exclusion
- * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
- */
-
-/* This can be made faster by writing it directly and not using
- * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
-
-static inline comp4_t
-blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    return DIV_ONE_UNc (sca * da + dca * sa - 2 * dca * sca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (exclusion)
-
-#undef PDF_SEPARABLE_BLEND_MODE
-
-/*
- * PDF nonseperable blend modes are implemented using the following functions
- * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid
- * and min value of the red, green and blue components.
- *
- * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
- *
- * clip_color (C):
- *   l = LUM (C)
- *   min = Cmin
- *   max = Cmax
- *   if n < 0.0
- *     C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
- *   if x > 1.0
- *     C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
- *   return C
- *
- * set_lum (C, l):
- *   d = l – LUM (C)
- *   C += d
- *   return clip_color (C)
- *
- * SAT (C) = CH_MAX (C) - CH_MIN (C)
- *
- * set_sat (C, s):
- *  if Cmax > Cmin
- *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
- *    Cmax = s
- *  else
- *    Cmid = Cmax = 0.0
- *  Cmin = 0.0
- *  return C
- */
-
-/* For premultiplied colors, we need to know what happens when C is
- * multiplied by a real number. LUM and SAT are linear:
- *
- *    LUM (r × C) = r × LUM (C)		SAT (r * C) = r * SAT (C)
- *
- * If we extend clip_color with an extra argument a and change
- *
- *        if x >= 1.0
- *
- * into
- *
- *        if x >= a
- *
- * then clip_color is also linear:
- *
- *    r * clip_color (C, a) = clip_color (r_c, ra);
- *
- * for positive r.
- *
- * Similarly, we can extend set_lum with an extra argument that is just passed
- * on to clip_color:
- *
- *   r * set_lum ( C, l, a)
- *
- *   = r × clip_color ( C + l - LUM (C), a)
- *
- *   = clip_color ( r * C + r × l - r * LUM (C), r * a)
- *
- *   = set_lum ( r * C, r * l, r * a)
- *
- * Finally, set_sat:
- *
- *    r * set_sat (C, s) = set_sat (x * C, r * s)
- *
- * The above holds for all non-zero x, because the x'es in the fraction for
- * C_mid cancel out. Specifically, it holds for x = r:
- *
- *    r * set_sat (C, s) = set_sat (r_c, rs)
- *
- */
-
-/* So, for the non-separable PDF blend modes, we have (using s, d for
- * non-premultiplied colors, and S, D for premultiplied:
- *
- *   Color:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
- *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
- *
- *
- *   Luminosity:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
- *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
- *
- *
- *   Saturation:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
- *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
- *                                        a_s * LUM (D), a_s * a_d)
- *   = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d))
- *
- *   Hue:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
- *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
- *
- */
-
-#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
-#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? c[1] : c[2]))
-#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100)
-#define SAT(c) (CH_MAX (c) - CH_MIN (c))
-
-#define PDF_NON_SEPARABLE_BLEND_MODE(name)				\
-    static void								\
-    combine_ ## name ## _u (pixman_implementation_t *imp,		\
-			    pixman_op_t op,				\
-                            comp4_t *dest,				\
-			    const comp4_t *src,				\
-			    const comp4_t *mask,			\
-			    int width)					\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i)					\
-	{								\
-	    comp4_t s = combine_mask (src, mask, i);			\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t sa = ALPHA_c (s);					\
-	    comp1_t isa = ~sa;						\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-	    comp4_t sc[3], dc[3], c[3];					\
-            								\
-	    result = d;							\
-	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
-	    dc[0] = RED_c (d);						\
-	    sc[0] = RED_c (s);						\
-	    dc[1] = GREEN_c (d);					\
-	    sc[1] = GREEN_c (s);					\
-	    dc[2] = BLUE_c (d);						\
-	    sc[2] = BLUE_c (s);						\
-	    blend_ ## name (c, dc, da, sc, sa);				\
-            								\
-	    *(dest + i) = result +					\
-		(DIV_ONE_UNc (sa * (comp4_t)da) << A_SHIFT) +		\
-		(DIV_ONE_UNc (c[0]) << R_SHIFT) +			\
-		(DIV_ONE_UNc (c[1]) << G_SHIFT) +			\
-		(DIV_ONE_UNc (c[2]));					\
-	}								\
-    }
-
-static void
-set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum)
-{
-    double a, l, min, max;
-    double tmp[3];
-
-    a = sa * (1.0 / MASK);
-
-    l = lum * (1.0 / MASK);
-    tmp[0] = src[0] * (1.0 / MASK);
-    tmp[1] = src[1] * (1.0 / MASK);
-    tmp[2] = src[2] * (1.0 / MASK);
-
-    l = l - LUM (tmp);
-    tmp[0] += l;
-    tmp[1] += l;
-    tmp[2] += l;
-
-    /* clip_color */
-    l = LUM (tmp);
-    min = CH_MIN (tmp);
-    max = CH_MAX (tmp);
-
-    if (min < 0)
-    {
-	if (l - min == 0.0)
-	{
-	    tmp[0] = 0;
-	    tmp[1] = 0;
-	    tmp[2] = 0;
-	}
-	else
-	{
-	    tmp[0] = l + (tmp[0] - l) * l / (l - min);
-	    tmp[1] = l + (tmp[1] - l) * l / (l - min);
-	    tmp[2] = l + (tmp[2] - l) * l / (l - min);
-	}
-    }
-    if (max > a)
-    {
-	if (max - l == 0.0)
-	{
-	    tmp[0] = a;
-	    tmp[1] = a;
-	    tmp[2] = a;
-	}
-	else
-	{
-	    tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
-	    tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
-	    tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
-	}
-    }
-
-    dest[0] = tmp[0] * MASK + 0.5;
-    dest[1] = tmp[1] * MASK + 0.5;
-    dest[2] = tmp[2] * MASK + 0.5;
-}
-
-static void
-set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat)
-{
-    int id[3];
-    comp4_t min, max;
-
-    if (src[0] > src[1])
-    {
-	if (src[0] > src[2])
-	{
-	    id[0] = 0;
-	    if (src[1] > src[2])
-	    {
-		id[1] = 1;
-		id[2] = 2;
-	    }
-	    else
-	    {
-		id[1] = 2;
-		id[2] = 1;
-	    }
-	}
-	else
-	{
-	    id[0] = 2;
-	    id[1] = 0;
-	    id[2] = 1;
-	}
-    }
-    else
-    {
-	if (src[0] > src[2])
-	{
-	    id[0] = 1;
-	    id[1] = 0;
-	    id[2] = 2;
-	}
-	else
-	{
-	    id[2] = 0;
-	    if (src[1] > src[2])
-	    {
-		id[0] = 1;
-		id[1] = 2;
-	    }
-	    else
-	    {
-		id[0] = 2;
-		id[1] = 1;
-	    }
-	}
-    }
-
-    max = dest[id[0]];
-    min = dest[id[2]];
-    if (max > min)
-    {
-	dest[id[1]] = (dest[id[1]] - min) * sat / (max - min);
-	dest[id[0]] = sat;
-	dest[id[2]] = 0;
-    }
-    else
-    {
-	dest[0] = dest[1] = dest[2] = 0;
-    }
-}
-
-/*
- * Hue:
- * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
- */
-static inline void
-blend_hsl_hue (comp4_t c[3],
-               comp4_t dc[3],
-               comp4_t da,
-               comp4_t sc[3],
-               comp4_t sa)
-{
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_sat (c, c, SAT (dc) * sa);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
-
-/*
- * Saturation:
- * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
- */
-static inline void
-blend_hsl_saturation (comp4_t c[3],
-                      comp4_t dc[3],
-                      comp4_t da,
-                      comp4_t sc[3],
-                      comp4_t sa)
-{
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_sat (c, c, SAT (sc) * da);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
-
-/*
- * Color:
- * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
- */
-static inline void
-blend_hsl_color (comp4_t c[3],
-                 comp4_t dc[3],
-                 comp4_t da,
-                 comp4_t sc[3],
-                 comp4_t sa)
-{
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
-
-/*
- * Luminosity:
- * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
- */
-static inline void
-blend_hsl_luminosity (comp4_t c[3],
-                      comp4_t dc[3],
-                      comp4_t da,
-                      comp4_t sc[3],
-                      comp4_t sa)
-{
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_lum (c, c, sa * da, LUM (sc) * da);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
-
-#undef SAT
-#undef LUM
-#undef CH_MAX
-#undef CH_MIN
-#undef PDF_NON_SEPARABLE_BLEND_MODE
-
-/* All of the disjoint/conjoint composing functions
- *
- * The four entries in the first column indicate what source contributions
- * come from each of the four areas of the picture -- areas covered by neither
- * A nor B, areas covered only by A, areas covered only by B and finally
- * areas covered by both A and B.
- * 
- * Disjoint			Conjoint
- * Fa		Fb		Fa		Fb
- * (0,0,0,0)	0		0		0		0
- * (0,A,0,A)	1		0		1		0
- * (0,0,B,B)	0		1		0		1
- * (0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
- * (0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
- * (0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
- * (0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
- * (0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
- * (0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
- * (0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
- * (0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
- * (0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)
- *
- * See  http://marc.info/?l=xfree-render&m=99792000027857&w=2  for more
- * information about these operators.
- */
-
-#define COMBINE_A_OUT 1
-#define COMBINE_A_IN  2
-#define COMBINE_B_OUT 4
-#define COMBINE_B_IN  8
-
-#define COMBINE_CLEAR   0
-#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
-#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
-#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
-#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
-#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
-#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
-#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
-
-/* portion covered by a but not b */
-static comp1_t
-combine_disjoint_out_part (comp1_t a, comp1_t b)
-{
-    /* min (1, (1-b) / a) */
-
-    b = ~b;                 /* 1 - b */
-    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
-	return MASK;        /* 1 */
-    return DIV_UNc (b, a);     /* (1-b) / a */
-}
-
-/* portion covered by both a and b */
-static comp1_t
-combine_disjoint_in_part (comp1_t a, comp1_t b)
-{
-    /* max (1-(1-b)/a,0) */
-    /*  = - min ((1-b)/a - 1, 0) */
-    /*  = 1 - min (1, (1-b)/a) */
-
-    b = ~b;                 /* 1 - b */
-    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
-	return 0;           /* 1 - 1 */
-    return ~DIV_UNc(b, a);    /* 1 - (1-b) / a */
-}
-
-/* portion covered by a but not b */
-static comp1_t
-combine_conjoint_out_part (comp1_t a, comp1_t b)
-{
-    /* max (1-b/a,0) */
-    /* = 1-min(b/a,1) */
-
-    /* min (1, (1-b) / a) */
-
-    if (b >= a)             /* b >= a -> b/a >= 1 */
-	return 0x00;        /* 0 */
-    return ~DIV_UNc(b, a);    /* 1 - b/a */
-}
-
-/* portion covered by both a and b */
-static comp1_t
-combine_conjoint_in_part (comp1_t a, comp1_t b)
-{
-    /* min (1,b/a) */
-
-    if (b >= a)             /* b >= a -> b/a >= 1 */
-	return MASK;        /* 1 */
-    return DIV_UNc (b, a);     /* b/a */
-}
-
-#define GET_COMP(v, i)   ((comp2_t) (comp1_t) ((v) >> i))
-
-#define ADD(x, y, i, t)							\
-    ((t) = GET_COMP (x, i) + GET_COMP (y, i),				\
-     (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
-
-#define GENERIC(x, y, i, ax, ay, t, u, v)				\
-    ((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) +			\
-            MUL_UNc (GET_COMP (x, i), ax, (v))),			\
-     (comp4_t) ((comp1_t) ((t) |					\
-                           (0 - ((t) >> G_SHIFT)))) << (i))
-
-static void
-combine_disjoint_general_u (comp4_t *      dest,
-                            const comp4_t *src,
-                            const comp4_t *mask,
-                            int            width,
-                            comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t m, n, o, p;
-	comp2_t Fa, Fb, t, u, v;
-	comp1_t sa = s >> A_SHIFT;
-	comp1_t da = d >> A_SHIFT;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    Fa = combine_disjoint_out_part (sa, da);
-	    break;
-
-	case COMBINE_A_IN:
-	    Fa = combine_disjoint_in_part (sa, da);
-	    break;
-
-	case COMBINE_A:
-	    Fa = MASK;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    Fb = combine_disjoint_out_part (da, sa);
-	    break;
-
-	case COMBINE_B_IN:
-	    Fb = combine_disjoint_in_part (da, sa);
-	    break;
-
-	case COMBINE_B:
-	    Fb = MASK;
-	    break;
-	}
-	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
-	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
-	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
-	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
-	s = m | n | o | p;
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_disjoint_over_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp2_t a = s >> A_SHIFT;
-
-	if (s != 0x00)
-	{
-	    comp4_t d = *(dest + i);
-	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
-	    UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s);
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_disjoint_in_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               comp4_t *                dest,
-                               const comp4_t *          src,
-                               const comp4_t *          mask,
-                               int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_disjoint_out_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_disjoint_atop_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_disjoint_xor_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
-}
-
-static void
-combine_conjoint_general_u (comp4_t *      dest,
-                            const comp4_t *src,
-                            const comp4_t *mask,
-                            int            width,
-                            comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t m, n, o, p;
-	comp2_t Fa, Fb, t, u, v;
-	comp1_t sa = s >> A_SHIFT;
-	comp1_t da = d >> A_SHIFT;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    Fa = combine_conjoint_out_part (sa, da);
-	    break;
-
-	case COMBINE_A_IN:
-	    Fa = combine_conjoint_in_part (sa, da);
-	    break;
-
-	case COMBINE_A:
-	    Fa = MASK;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    Fb = combine_conjoint_out_part (da, sa);
-	    break;
-
-	case COMBINE_B_IN:
-	    Fb = combine_conjoint_in_part (da, sa);
-	    break;
-
-	case COMBINE_B:
-	    Fb = MASK;
-	    break;
-	}
-
-	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
-	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
-	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
-	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_conjoint_over_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
-}
-
-static void
-combine_conjoint_in_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               comp4_t *                dest,
-                               const comp4_t *          src,
-                               const comp4_t *          mask,
-                               int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_conjoint_out_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_conjoint_atop_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_conjoint_xor_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
-}
-
-/************************************************************************/
-/*********************** Per Channel functions **************************/
-/************************************************************************/
-
-static void
-combine_clear_ca (pixman_implementation_t *imp,
-                  pixman_op_t              op,
-                  comp4_t *                dest,
-                  const comp4_t *          src,
-                  const comp4_t *          mask,
-                  int                      width)
-{
-    memset (dest, 0, width * sizeof(comp4_t));
-}
-
-static void
-combine_src_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-
-	combine_mask_value_ca (&s, &m);
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_over_ca (pixman_implementation_t *imp,
-                 pixman_op_t              op,
-                 comp4_t *                dest,
-                 const comp4_t *          src,
-                 const comp4_t *          mask,
-                 int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_ca (&s, &m);
-
-	a = ~m;
-	if (a)
-	{
-	    comp4_t d = *(dest + i);
-	    UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s);
-	    s = d;
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_over_reverse_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t a = ~d >> A_SHIFT;
-
-	if (a)
-	{
-	    comp4_t s = *(src + i);
-	    comp4_t m = *(mask + i);
-
-	    UNcx4_MUL_UNcx4 (s, m);
-	    UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d);
-
-	    *(dest + i) = s;
-	}
-    }
-}
-
-static void
-combine_in_ca (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp2_t a = d >> A_SHIFT;
-	comp4_t s = 0;
-
-	if (a)
-	{
-	    comp4_t m = *(mask + i);
-
-	    s = *(src + i);
-	    combine_mask_value_ca (&s, &m);
-
-	    if (a != MASK)
-		UNcx4_MUL_UNc (s, a);
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_in_reverse_ca (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_alpha_ca (&s, &m);
-
-	a = m;
-	if (a != ~0)
-	{
-	    comp4_t d = 0;
-
-	    if (a)
-	    {
-		d = *(dest + i);
-		UNcx4_MUL_UNcx4 (d, a);
-	    }
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_out_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp2_t a = ~d >> A_SHIFT;
-	comp4_t s = 0;
-
-	if (a)
-	{
-	    comp4_t m = *(mask + i);
-
-	    s = *(src + i);
-	    combine_mask_value_ca (&s, &m);
-
-	    if (a != MASK)
-		UNcx4_MUL_UNc (s, a);
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_out_reverse_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_alpha_ca (&s, &m);
-
-	a = ~m;
-	if (a != ~0)
-	{
-	    comp4_t d = 0;
-
-	    if (a)
-	    {
-		d = *(dest + i);
-		UNcx4_MUL_UNcx4 (d, a);
-	    }
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_atop_ca (pixman_implementation_t *imp,
-                 pixman_op_t              op,
-                 comp4_t *                dest,
-                 const comp4_t *          src,
-                 const comp4_t *          mask,
-                 int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = ~m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_atop_reverse_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = ~d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_xor_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = ~d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = ~m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_add_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t d = *(dest + i);
-
-	combine_mask_value_ca (&s, &m);
-
-	UNcx4_ADD_UNcx4 (d, s);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_saturate_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     comp4_t *                dest,
-                     const comp4_t *          src,
-                     const comp4_t *          mask,
-                     int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp2_t sa, sr, sg, sb, da;
-	comp2_t t, u, v;
-	comp4_t m, n, o, p;
-
-	d = *(dest + i);
-	s = *(src + i);
-	m = *(mask + i);
-
-	combine_mask_ca (&s, &m);
-
-	sa = (m >> A_SHIFT);
-	sr = (m >> R_SHIFT) & MASK;
-	sg = (m >> G_SHIFT) & MASK;
-	sb =  m             & MASK;
-	da = ~d >> A_SHIFT;
-
-	if (sb <= da)
-	    m = ADD (s, d, 0, t);
-	else
-	    m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);
-
-	if (sg <= da)
-	    n = ADD (s, d, G_SHIFT, t);
-	else
-	    n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
-
-	if (sr <= da)
-	    o = ADD (s, d, R_SHIFT, t);
-	else
-	    o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
-
-	if (sa <= da)
-	    p = ADD (s, d, A_SHIFT, t);
-	else
-	    p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
-
-	*(dest + i) = m | n | o | p;
-    }
-}
-
-static void
-combine_disjoint_general_ca (comp4_t *      dest,
-                             const comp4_t *src,
-                             const comp4_t *mask,
-                             int            width,
-                             comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp4_t m, n, o, p;
-	comp4_t Fa, Fb;
-	comp2_t t, u, v;
-	comp4_t sa;
-	comp1_t da;
-
-	s = *(src + i);
-	m = *(mask + i);
-	d = *(dest + i);
-	da = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	sa = m;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A_IN:
-	    m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A:
-	    Fa = ~0;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B_IN:
-	    m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B:
-	    Fb = ~0;
-	    break;
-	}
-	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
-	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
-	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
-	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_disjoint_over_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_disjoint_in_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_disjoint_out_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_disjoint_atop_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_disjoint_xor_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
-}
-
-static void
-combine_conjoint_general_ca (comp4_t *      dest,
-                             const comp4_t *src,
-                             const comp4_t *mask,
-                             int            width,
-                             comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp4_t m, n, o, p;
-	comp4_t Fa, Fb;
-	comp2_t t, u, v;
-	comp4_t sa;
-	comp1_t da;
-
-	s = *(src + i);
-	m = *(mask + i);
-	d = *(dest + i);
-	da = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	sa = m;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A_IN:
-	    m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A:
-	    Fa = ~0;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B_IN:
-	    m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B:
-	    Fb = ~0;
-	    break;
-	}
-	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
-	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
-	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
-	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_conjoint_over_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
-}
-
-static void
-combine_conjoint_in_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_conjoint_out_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_conjoint_atop_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_conjoint_xor_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
-}
-
-void
-_pixman_setup_combiner_functions_width (pixman_implementation_t *imp)
-{
-    /* Unified alpha */
-    imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
-    imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
-    imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
-    imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_OUT] = combine_out_u;
-    imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u;
-    imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u;
-    imp->combine_width[PIXMAN_OP_ADD] = combine_add_u;
-    imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u;
-
-    /* Disjoint, unified */
-    imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
-
-    /* Conjoint, unified */
-    imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
-
-    imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
-    imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u;
-    imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u;
-    imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u;
-    imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
-    imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
-    imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
-    imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
-    imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
-    imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
-    imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
-    imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
-    imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
-    imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
-    imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
-
-    /* Component alpha combiners */
-    imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca;
-    /* dest */
-    imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca;
-    imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca;
-    imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
-
-    /* Disjoint CA */
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
-
-    /* Conjoint CA */
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
-
-    imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
-    imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
-    imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
-    imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
-    imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
-    imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
-    imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
-    imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
-    imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
-    imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
-    imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
-
-    /* It is not clear that these make sense, so make them noops for now */
-    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
-}
-
deleted file mode 100644
--- a/gfx/cairo/libpixman/src/pixman-combine.h.template
+++ /dev/null
@@ -1,226 +0,0 @@
-
-#define COMPONENT_SIZE
-#define MASK
-#define ONE_HALF
-
-#define A_SHIFT
-#define R_SHIFT
-#define G_SHIFT
-#define A_MASK
-#define R_MASK
-#define G_MASK
-
-#define RB_MASK
-#define AG_MASK
-#define RB_ONE_HALF
-#define RB_MASK_PLUS_ONE
-
-#define ALPHA_c(x) ((x) >> A_SHIFT)
-#define RED_c(x) (((x) >> R_SHIFT) & MASK)
-#define GREEN_c(x) (((x) >> G_SHIFT) & MASK)
-#define BLUE_c(x) ((x) & MASK)
-
-/*
- * Helper macros.
- */
-
-#define MUL_UNc(a, b, t)						\
-    ((t) = (a) * (comp2_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))
-
-#define DIV_UNc(a, b)							\
-    (((comp2_t) (a) * MASK + ((b) / 2)) / (b))
-
-#define ADD_UNc(x, y, t)				     \
-    ((t) = (x) + (y),					     \
-     (comp4_t) (comp1_t) ((t) | (0 - ((t) >> G_SHIFT))))
-
-#define DIV_ONE_UNc(x)							\
-    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
-
-/*
- * The methods below use some tricks to be able to do two color
- * components at the same time.
- */
-
-/*
- * x_rb = (x_rb * a) / 255
- */
-#define UNc_rb_MUL_UNc(x, a, t)						\
-    do									\
-    {									\
-	t  = ((x) & RB_MASK) * (a);					\
-	t += RB_ONE_HALF;						\
-	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
-	x &= RB_MASK;							\
-    } while (0)
-
-/*
- * x_rb = min (x_rb + y_rb, 255)
- */
-#define UNc_rb_ADD_UNc_rb(x, y, t)					\
-    do									\
-    {									\
-	t = ((x) + (y));						\
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
-	x = (t & RB_MASK);						\
-    } while (0)
-
-/*
- * x_rb = (x_rb * a_rb) / 255
- */
-#define UNc_rb_MUL_UNc_rb(x, a, t)					\
-    do									\
-    {									\
-	t  = (x & MASK) * (a & MASK);					\
-	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\
-	t += RB_ONE_HALF;						\
-	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
-	x = t & RB_MASK;						\
-    } while (0)
-
-/*
- * x_c = (x_c * a) / 255
- */
-#define UNcx4_MUL_UNc(x, a)						\
-    do									\
-    {									\
-	comp4_t r1__, r2__, t__;					\
-									\
-	r1__ = (x);							\
-	UNc_rb_MUL_UNc (r1__, (a), t__);				\
-									\
-	r2__ = (x) >> G_SHIFT;						\
-	UNc_rb_MUL_UNc (r2__, (a), t__);				\
-									\
-	(x) = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
- * x_c = (x_c * a) / 255 + y_c
- */
-#define UNcx4_MUL_UNc_ADD_UNcx4(x, a, y)				\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x);							\
-	r2__ = (y) & RB_MASK;						\
-	UNc_rb_MUL_UNc (r1__, (a), t__);				\
-	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = (x) >> G_SHIFT;						\
-	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
-	UNc_rb_MUL_UNc (r2__, (a), t__);				\
-	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
-									\
-	(x) = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
- * x_c = (x_c * a + y_c * b) / 255
- */
-#define UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x);							\
-	r2__ = (y);							\
-	UNc_rb_MUL_UNc (r1__, (a), t__);				\
-	UNc_rb_MUL_UNc (r2__, (b), t__);				\
-	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = ((x) >> G_SHIFT);					\
-	r3__ = ((y) >> G_SHIFT);					\
-	UNc_rb_MUL_UNc (r2__, (a), t__);				\
-	UNc_rb_MUL_UNc (r3__, (b), t__);				\
-	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
-									\
-	(x) = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
- * x_c = (x_c * a_c) / 255
- */
-#define UNcx4_MUL_UNcx4(x, a)						\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x);							\
-	r2__ = (a);							\
-	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = (x) >> G_SHIFT;						\
-	r3__ = (a) >> G_SHIFT;						\
-	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
-									\
-	(x) = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
- * x_c = (x_c * a_c) / 255 + y_c
- */
-#define UNcx4_MUL_UNcx4_ADD_UNcx4(x, a, y)				\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x);							\
-	r2__ = (a);							\
-	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
-	r2__ = (y) & RB_MASK;						\
-	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = ((x) >> G_SHIFT);					\
-	r3__ = ((a) >> G_SHIFT);					\
-	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
-	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
-	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
-									\
-	(x) = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
- * x_c = (x_c * a_c + y_c * b) / 255
- */
-#define UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x);							\
-	r2__ = (a);							\
-	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
-	r2__ = (y);							\
-	UNc_rb_MUL_UNc (r2__, (b), t__);				\
-	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = (x) >> G_SHIFT;						\
-	r3__ = (a) >> G_SHIFT;						\
-	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
-	r3__ = (y) >> G_SHIFT;						\
-	UNc_rb_MUL_UNc (r3__, (b), t__);				\
-	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
-									\
-	x = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
-
-/*
-  x_c = min(x_c + y_c, 255)
-*/
-#define UNcx4_ADD_UNcx4(x, y)						\
-    do									\
-    {									\
-	comp4_t r1__, r2__, r3__, t__;					\
-									\
-	r1__ = (x) & RB_MASK;						\
-	r2__ = (y) & RB_MASK;						\
-	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
-									\
-	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\
-	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
-	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
-									\
-	x = r1__ | (r2__ << G_SHIFT);					\
-    } while (0)
deleted file mode 100644
--- a/gfx/cairo/libpixman/src/pixman-combine16.c
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <math.h>
-#include <string.h>
-
-#include "pixman-private.h"
-
-#include "pixman-combine32.h"
-
-static force_inline uint32_t
-combine_mask (const uint32_t src, const uint32_t mask)
-{
-    uint32_t s, m;
-
-    m = mask >> A_SHIFT;
-
-    if (!m)
-	return 0;
-    s = src;
-
-    UN8x4_MUL_UN8 (s, m);
-
-    return s;
-}
-
-static void
-combine_src_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               uint32_t *               dest,
-               const uint32_t *         src,
-               const uint32_t *         mask,
-               int                      width)
-{
-    int i;
-
-    if (!mask)
-	memcpy (dest, src, width * sizeof (uint16_t));
-    else
-    {
-	uint16_t *d = (uint16_t*)dest;
-	uint16_t *src16 = (uint16_t*)src;
-	for (i = 0; i < width; ++i)
-	{
-	    if ((*mask & 0xff000000) == 0xff000000) {
-		// it's likely worth special casing
-		// fully opaque because it avoids
-		// the cost of conversion as well the multiplication
-		*(d + i) = *src16;
-	    } else {
-		// the mask is still 32bits
-		uint32_t s = combine_mask (convert_0565_to_8888(*src16), *mask);
-		*(d + i) = convert_8888_to_0565(s);
-	    }
-	    mask++;
-	    src16++;
-	}
-    }
-
-}
-
-static void
-combine_over_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
-               int                      width)
-{
-    int i;
-
-    if (!mask)
-	memcpy (dest, src, width * sizeof (uint16_t));
-    else
-    {
-	uint16_t *d = (uint16_t*)dest;
-	uint16_t *src16 = (uint16_t*)src;
-	for (i = 0; i < width; ++i)
-	{
-	    if ((*mask & 0xff000000) == 0xff000000) {
-		// it's likely worth special casing
-		// fully opaque because it avoids
-		// the cost of conversion as well the multiplication
-		*(d + i) = *src16;
-	    } else if ((*mask & 0xff000000) == 0x00000000) {
-		// keep the dest the same
-	    } else {
-		// the mask is still 32bits
-		uint32_t s = combine_mask (convert_0565_to_8888(*src16), *mask);
-		uint32_t ia = ALPHA_8 (~s);
-		uint32_t d32 = convert_0565_to_8888(*(d + i));
-		UN8x4_MUL_UN8_ADD_UN8x4 (d32, ia, s);
-		*(d + i) = convert_8888_to_0565(d32);
-	    }
-	    mask++;
-	    src16++;
-	}
-    }
-
-}
-
-
-void
-_pixman_setup_combiner_functions_16 (pixman_implementation_t *imp)
-{
-    int i;
-    for (i = 0; i < PIXMAN_N_OPERATORS; i++) {
-	imp->combine_16[i] = NULL;
-    }
-    imp->combine_16[PIXMAN_OP_SRC] = combine_src_u;
-    imp->combine_16[PIXMAN_OP_OVER] = combine_over_u;
-}
-
--- a/gfx/cairo/libpixman/src/pixman-combine32.c
+++ b/gfx/cairo/libpixman/src/pixman-combine32.c
@@ -137,41 +137,41 @@ combine_mask (const uint32_t *src, const
 	UN8x4_MUL_UN8 (s, m);
 
     return s;
 }
 
 static void
 combine_clear (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
-    memset (dest, 0, width * sizeof(uint32_t));
+    memset (dest, 0, width * sizeof (uint32_t));
 }
 
 static void
 combine_dst (pixman_implementation_t *imp,
 	     pixman_op_t	      op,
 	     uint32_t *		      dest,
 	     const uint32_t *	      src,
-	     const uint32_t *          mask,
+	     const uint32_t *         mask,
 	     int		      width)
 {
     return;
 }
 
 static void
 combine_src_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
 
     if (!mask)
     {
 	memcpy (dest, src, width * sizeof (uint32_t));
     }
@@ -184,19 +184,19 @@ combine_src_u (pixman_implementation_t *
 	    *(dest + i) = s;
 	}
     }
 }
 
 static void
 combine_over_u (pixman_implementation_t *imp,
                 pixman_op_t              op,
-                uint32_t *                dest,
-                const uint32_t *          src,
-                const uint32_t *          mask,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
                 int                      width)
 {
     int i;
 
     if (!mask)
     {
 	for (i = 0; i < width; ++i)
 	{
@@ -249,19 +249,19 @@ combine_over_u (pixman_implementation_t 
 	    }
 	}
     }
 }
 
 static void
 combine_over_reverse_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
-                        uint32_t *                dest,
-                        const uint32_t *          src,
-                        const uint32_t *          mask,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
                         int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t d = *(dest + i);
@@ -269,38 +269,38 @@ combine_over_reverse_u (pixman_implement
 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
 	*(dest + i) = s;
     }
 }
 
 static void
 combine_in_u (pixman_implementation_t *imp,
               pixman_op_t              op,
-              uint32_t *                dest,
-              const uint32_t *          src,
-              const uint32_t *          mask,
+              uint32_t *               dest,
+              const uint32_t *         src,
+              const uint32_t *         mask,
               int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t a = ALPHA_8 (*(dest + i));
 	UN8x4_MUL_UN8 (s, a);
 	*(dest + i) = s;
     }
 }
 
 static void
 combine_in_reverse_u (pixman_implementation_t *imp,
                       pixman_op_t              op,
-                      uint32_t *                dest,
-                      const uint32_t *          src,
-                      const uint32_t *          mask,
+                      uint32_t *               dest,
+                      const uint32_t *         src,
+                      const uint32_t *         mask,
                       int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t d = *(dest + i);
@@ -308,38 +308,38 @@ combine_in_reverse_u (pixman_implementat
 	UN8x4_MUL_UN8 (d, a);
 	*(dest + i) = d;
     }
 }
 
 static void
 combine_out_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t a = ALPHA_8 (~*(dest + i));
 	UN8x4_MUL_UN8 (s, a);
 	*(dest + i) = s;
     }
 }
 
 static void
 combine_out_reverse_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
-                       uint32_t *                dest,
-                       const uint32_t *          src,
-                       const uint32_t *          mask,
+                       uint32_t *               dest,
+                       const uint32_t *         src,
+                       const uint32_t *         mask,
                        int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t d = *(dest + i);
@@ -347,19 +347,19 @@ combine_out_reverse_u (pixman_implementa
 	UN8x4_MUL_UN8 (d, a);
 	*(dest + i) = d;
     }
 }
 
 static void
 combine_atop_u (pixman_implementation_t *imp,
                 pixman_op_t              op,
-                uint32_t *                dest,
-                const uint32_t *          src,
-                const uint32_t *          mask,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
                 int                      width)
 {
     int i;
 
     for (i = 0; i < width; ++i)
     {
 	uint32_t s = combine_mask (src, mask, i);
 	uint32_t d = *(dest + i);
@@ -369,19 +369,19 @@ combine_atop_u (pixman_implementation_t 
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
 	*(dest + i) = s;
     }
 }