Bug 1200684 - revise Skia row blits to preserve color when blended against itself. r=mchang
authorLee Salzman <lsalzman@mozilla.com>
Fri, 05 Aug 2016 19:50:49 -0400
changeset 397549 ffd7f9c1bbf9b6d8cc9e87a71e0f15fc0d2fefb4
parent 397548 aef42c5d23290400e0ba59afc8c95f43fd2b1b8a
child 397550 27c0dd5afde663dd19e094c022e8fb2149177940
push id25332
push usermaglione.k@gmail.com
push dateSat, 06 Aug 2016 21:21:51 +0000
reviewersmchang
bugs1200684
milestone51.0a1
Bug 1200684 - revise Skia row blits to preserve color when blended against itself. r=mchang MozReview-Commit-ID: Enz05s8vGuI
gfx/skia/skia/include/core/SkColorPriv.h
gfx/skia/skia/src/core/SkBlitRow_D16.cpp
gfx/skia/skia/src/core/SkBlitRow_D32.cpp
gfx/skia/skia/src/core/SkBlitter_A8.cpp
gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
gfx/skia/skia/src/core/SkSpriteBlitter_RGB16.cpp
gfx/skia/skia/src/opts/SkBlitRow_opts_SSE2.cpp
gfx/skia/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
gfx/skia/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
gfx/skia/skia/src/opts/SkColor_opts_SSE2.h
layout/reftests/async-scrolling/reftest.list
layout/reftests/border-radius/reftest.list
layout/reftests/svg/reftest.list
--- a/gfx/skia/skia/include/core/SkColorPriv.h
+++ b/gfx/skia/skia/include/core/SkColorPriv.h
@@ -195,16 +195,28 @@ static inline unsigned Sk255To256(U8CPU 
     return value + (value >> 7);
 }
 
 /** Multiplify value by 0..256, and shift the result down 8
     (i.e. return (value * alpha256) >> 8)
  */
 #define SkAlphaMul(value, alpha256)     (((value) * (alpha256)) >> 8)
 
+/** Calculates 256 - (value * alpha256) / 255 in range [0,256],
+ *  for [0,255] value and [0,256] alpha256.
+ */
+static inline U16CPU SkAlphaMulInv256(U16CPU value, U16CPU alpha256) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return SkAlpha255To256(255 - SkAlphaMul(value, alpha256));
+#else
+    unsigned prod = 0xFFFF - value * alpha256;
+    return (prod + (prod >> 8)) >> 8;
+#endif
+}
+
 //  The caller may want negative values, so keep all params signed (int)
 //  so we don't accidentally slip into unsigned math and lose the sign
 //  extension when we shift (in SkAlphaMul)
 static inline int SkAlphaBlend(int src, int dst, int scale256) {
     SkASSERT((unsigned)scale256 <= 256);
     return dst + SkAlphaMul(src - dst, scale256);
 }
 
@@ -563,23 +575,48 @@ static SK_ALWAYS_INLINE uint32_t SkAlpha
     uint32_t ag = ((c >> 8) & mask) * scale;
     return (rb & mask) | (ag & ~mask);
 }
 
 static inline SkPMColor SkPMSrcOver(SkPMColor src, SkPMColor dst) {
     return src + SkAlphaMulQ(dst, SkAlpha255To256(255 - SkGetPackedA32(src)));
 }
 
+/**
+ * Interpolates between colors src and dst using [0,256] scale.
+ */
+static inline SkPMColor SkPMLerp(SkPMColor src, SkPMColor dst, unsigned scale) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return SkAlphaMulQ(src, scale) + SkAlphaMulQ(dst, 256 - scale);
+#else
+    return SkFastFourByteInterp256(src, dst, scale);
+#endif
+}
+
 static inline SkPMColor SkBlendARGB32(SkPMColor src, SkPMColor dst, U8CPU aa) {
     SkASSERT((unsigned)aa <= 255);
 
     unsigned src_scale = SkAlpha255To256(aa);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
     unsigned dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale));
 
     return SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale);
+#else
+    unsigned dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), src_scale);
+
+    const uint32_t mask = 0xFF00FF;
+
+    uint32_t src_rb = (src & mask) * src_scale;
+    uint32_t src_ag = ((src >> 8) & mask) * src_scale;
+
+    uint32_t dst_rb = (dst & mask) * dst_scale;
+    uint32_t dst_ag = ((dst >> 8) & mask) * dst_scale;
+
+    return (((src_rb + dst_rb) >> 8) & mask) | ((src_ag + dst_ag) & ~mask);
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Convert a 32bit pixel to a 16bit pixel (no dither)
 
 #define SkR32ToR16_MACRO(r)   ((unsigned)(r) >> (SK_R32_BITS - SK_R16_BITS))
 #define SkG32ToG16_MACRO(g)   ((unsigned)(g) >> (SK_G32_BITS - SK_G16_BITS))
 #define SkB32ToB16_MACRO(b)   ((unsigned)(b) >> (SK_B32_BITS - SK_B16_BITS))
--- a/gfx/skia/skia/src/core/SkBlitRow_D16.cpp
+++ b/gfx/skia/skia/src/core/SkBlitRow_D16.cpp
@@ -175,17 +175,17 @@ static void S32A_D565_Blend_Dither(uint1
         DITHER_565_SCAN(y);
         do {
             SkPMColor c = *src++;
             SkPMColorAssert(c);
             if (c)
             {
                 unsigned d = *dst;
                 int sa = SkGetPackedA32(c);
-                int dst_scale = SkAlpha255To256(255 - SkAlphaMul(sa, src_scale));
+                int dst_scale = SkAlphaMulInv256(sa, src_scale);
                 int dither = DITHER_VALUE(x);
 
                 int sr = SkGetPackedR32(c);
                 int sg = SkGetPackedG32(c);
                 int sb = SkGetPackedB32(c);
                 sr = SkDITHER_R32To565(sr, dither);
                 sg = SkDITHER_G32To565(sg, dither);
                 sb = SkDITHER_B32To565(sb, dither);
--- a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+++ b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
@@ -21,35 +21,37 @@ static void S32_Opaque_BlitRow32(SkPMCol
 }
 
 static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count > 0) {
         unsigned src_scale = SkAlpha255To256(alpha);
-        unsigned dst_scale = 256 - src_scale;
 
 #ifdef UNROLL
         if (count & 1) {
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
             count -= 1;
         }
 
         const SkPMColor* SK_RESTRICT srcEnd = src + count;
         while (src != srcEnd) {
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
         }
 #else
         do {
-            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
             src += 1;
             dst += 1;
         } while (--count > 0);
 #endif
     }
 }
 
 static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
--- a/gfx/skia/skia/src/core/SkBlitter_A8.cpp
+++ b/gfx/skia/skia/src/core/SkBlitter_A8.cpp
@@ -271,17 +271,17 @@ void SkA8_Shader_Blitter::blitH(int x, i
     }
 }
 
 static inline uint8_t aa_blend8(SkPMColor src, U8CPU da, int aa) {
     SkASSERT((unsigned)aa <= 255);
 
     int src_scale = SkAlpha255To256(aa);
     int sa = SkGetPackedA32(src);
-    int dst_scale = 256 - SkAlphaMul(sa, src_scale);
+    int dst_scale = SkAlphaMulInv256(sa, src_scale);
 
     return SkToU8((sa * src_scale + da * dst_scale) >> 8);
 }
 
 void SkA8_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
                                     const int16_t runs[]) {
     SkShader::Context* shaderContext = fShaderContext;
     SkXfermode*        mode = fXfermode;
--- a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
+++ b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
@@ -222,17 +222,21 @@ void SkARGB32_Blitter::blitV(int x, int 
 
     uint32_t* device = fDevice.writable_addr32(x, y);
     uint32_t  color = fPMColor;
 
     if (alpha != 255) {
         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
     }
 
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
     unsigned dst_scale = 255 - SkGetPackedA32(color);
+#else
+    unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
+#endif
     size_t rowBytes = fDevice.rowBytes();
     while (--height >= 0) {
         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
         device = (uint32_t*)((char*)device + rowBytes);
     }
 }
 
 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
--- a/gfx/skia/skia/src/core/SkSpriteBlitter_RGB16.cpp
+++ b/gfx/skia/skia/src/core/SkSpriteBlitter_RGB16.cpp
@@ -25,17 +25,21 @@ static inline void D16_S32A_Blend_Pixel_
     unsigned sa = SkGetPackedA32(sc);
     unsigned dr, dg, db;
 
     if (255 == sa) {
         dr = SkAlphaBlend(SkPacked32ToR16(sc), SkGetPackedR16(dc), src_scale);
         dg = SkAlphaBlend(SkPacked32ToG16(sc), SkGetPackedG16(dc), src_scale);
         db = SkAlphaBlend(SkPacked32ToB16(sc), SkGetPackedB16(dc), src_scale);
     } else {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         unsigned dst_scale = 255 - SkAlphaMul(sa, src_scale);
+#else
+        unsigned dst_scale = SkAlphaMulInv256(sa, src_scale);
+#endif
         dr = (SkPacked32ToR16(sc) * src_scale + SkGetPackedR16(dc) * dst_scale) >> 8;
         dg = (SkPacked32ToG16(sc) * src_scale + SkGetPackedG16(dc) * dst_scale) >> 8;
         db = (SkPacked32ToB16(sc) * src_scale + SkGetPackedB16(dc) * dst_scale) >> 8;
     }
     *dst = SkPackRGB16(dr, dg, db);
 }
 
 #define D16_S32A_Blend_Pixel(dst, sc, src_scale) \
--- a/gfx/skia/skia/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/gfx/skia/skia/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -21,51 +21,46 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor*
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count <= 0) {
         return;
     }
 
     uint32_t src_scale = SkAlpha255To256(alpha);
-    uint32_t dst_scale = 256 - src_scale;
 
     if (count >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
             src++;
             dst++;
             count--;
         }
 
         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
 
         while (count >= 4) {
             // Load 4 pixels each of src and dest.
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);
 
-            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
-            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
-
-            // Add result
-            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
+            __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
             _mm_store_si128(d, result);
             s++;
             d++;
             count -= 4;
         }
         src = reinterpret_cast<const SkPMColor*>(s);
         dst = reinterpret_cast<SkPMColor*>(d);
     }
 
     while (count > 0) {
-        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+        *dst = SkPMLerp(*src, *dst, src_scale);
         src++;
         dst++;
         count--;
     }
 }
 
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
--- a/gfx/skia/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/gfx/skia/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -904,17 +904,22 @@ void S32_Blend_BlitRow32_neon(SkPMColor*
         // Process src
         vsrc_wide = vmovl_u8(vsrc);
         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
 
         // Process dst
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
 
         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         // Store
         vst1_u32(dst, vreinterpret_u32_u8(vres));
 
         src += 2;
         dst += 2;
         count -= 2;
     }
@@ -926,29 +931,34 @@ void S32_Blend_BlitRow32_neon(SkPMColor*
         // Load
         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
 
         // Process
         vsrc_wide = vmovl_u8(vsrc);
         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         // Store
         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
     }
 }
 
 #ifdef SK_CPU_ARM32
 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src,
                          int count, U8CPU alpha) {
 
-    SkASSERT(255 >= alpha);
+    SkASSERT(255 > alpha);
 
     if (count <= 0) {
         return;
     }
 
     unsigned alpha256 = SkAlpha255To256(alpha);
 
     // First deal with odd counts
@@ -958,30 +968,33 @@ void S32A_Blend_BlitRow32_neon(SkPMColor
         unsigned dst_scale;
 
         // Load
         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
 
         // Calc dst_scale
         dst_scale = vget_lane_u8(vsrc, 3);
-        dst_scale *= alpha256;
-        dst_scale >>= 8;
-        dst_scale = 256 - dst_scale;
+        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
 
         // Process src
         vsrc_wide = vmovl_u8(vsrc);
         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
 
         // Process dst
         vdst_wide = vmovl_u8(vdst);
         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
 
         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
         dst++;
         src++;
         count--;
     }
 
     if (count) {
@@ -1002,30 +1015,46 @@ void S32A_Blend_BlitRow32_neon(SkPMColor
             vdst = vreinterpret_u8_u32(vld1_u32(dst));
 
             // Prepare src_scale
             vsrc_scale = vdupq_n_u16(alpha256);
 
             // Calc dst_scale
             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
             vdst_scale = vmovl_u8(vsrc_alphas);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
             vdst_scale *= vsrc_scale;
             vdst_scale = vshrq_n_u16(vdst_scale, 8);
             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
+#else
+            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
+            // A 16-bit lane would overflow if we used 0xFFFF here,
+            // so use an approximation with 0xFF00 that is off by 1,
+            // and add back 1 after to get the correct value.
+            // This is valid if alpha256 <= 255.
+            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
+            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
+            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
+#endif
 
             // Process src
             vsrc_wide = vmovl_u8(vsrc);
             vsrc_wide *= vsrc_scale;
 
             // Process dst
             vdst_wide = vmovl_u8(vdst);
             vdst_wide *= vdst_scale;
 
             // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
             vst1_u32(dst, vreinterpret_u32_u8(vres));
 
             src += 2;
             dst += 2;
             count -= 2;
         } while(count);
     }
--- a/gfx/skia/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
+++ b/gfx/skia/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
@@ -784,19 +784,25 @@ static void S32_Blend_BlitRow32_mips_dsp
         "preceu.ph.qbr   %[t4],    %[t1]              \n\t"
         "preceu.ph.qbl   %[t5],    %[t1]              \n\t"
         "muleu_s.ph.qbr  %[t2],    %[t7],    %[t2]    \n\t"
         "muleu_s.ph.qbr  %[t3],    %[t7],    %[t3]    \n\t"
         "muleu_s.ph.qbr  %[t4],    %[t6],    %[t4]    \n\t"
         "muleu_s.ph.qbr  %[t5],    %[t6],    %[t5]    \n\t"
         "addiu           %[src],   %[src],   4        \n\t"
         "addiu           %[count], %[count], -1       \n\t"
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         "precrq.qb.ph    %[t0],    %[t3],    %[t2]    \n\t"
         "precrq.qb.ph    %[t2],    %[t5],    %[t4]    \n\t"
         "addu            %[t1],    %[t0],    %[t2]    \n\t"
+#else
+        "addu            %[t0],    %[t3],    %[t5]    \n\t"
+        "addu            %[t2],    %[t2],    %[t4]    \n\t"
+        "precrq.qb.ph    %[t1],    %[t0],    %[t2]    \n\t"
+#endif
         "sw              %[t1],    0(%[dst])          \n\t"
         "b               1b                           \n\t"
         " addi           %[dst],   %[dst],   4        \n\t"
     "2:                                               \n\t"
         ".set            pop                          \n\t"
         : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
           [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
           [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
--- a/gfx/skia/skia/src/opts/SkColor_opts_SSE2.h
+++ b/gfx/skia/skia/src/opts/SkColor_opts_SSE2.h
@@ -75,16 +75,52 @@ static inline __m128i SkAlphaMulQ_SSE2(c
 
     __m128i ag = _mm_andnot_si128(mask, c);
     ag = _mm_mulhi_epu16(ag, s);     // Alpha and green values are in the higher byte of each word.
     ag = _mm_andnot_si128(mask, ag);
 
     return _mm_or_si128(rb, ag);
 }
 
+// Portable version is SkFastFourByteInterp256 in SkColorPriv.h.
+static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {
+    // Computes dst + (((src - dst)*src_scale)>>8)
+    const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+    // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
+    __m128i src_rb = _mm_and_si128(mask, src);
+    __m128i src_ag = _mm_srli_epi16(src, 8);
+    __m128i dst_rb = _mm_and_si128(mask, dst);
+    __m128i dst_ag = _mm_srli_epi16(dst, 8);
+
+    // Compute scaled differences.
+    __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
+    __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
+    __m128i s = _mm_set1_epi16(src_scale);
+    diff_rb = _mm_mullo_epi16(diff_rb, s);
+    diff_ag = _mm_mullo_epi16(diff_ag, s);
+
+    // Pack the differences back together.
+    diff_rb = _mm_srli_epi16(diff_rb, 8);
+    diff_ag = _mm_andnot_si128(mask, diff_ag);
+    __m128i diff = _mm_or_si128(diff_rb, diff_ag);
+
+    // Add difference to destination.
+    return _mm_add_epi8(dst, diff);
+}
+
+// Portable version is SkPMLerp in SkColorPriv.h.
+static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale));
+#else
+    return SkFastFourByteInterp256_SSE2(src, dst, scale);
+#endif
+}
+
 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
 #if SK_A32_SHIFT == 24                // It's very common (universal?) that alpha is the top byte.
     return _mm_srli_epi32(src, 24);   // You'd hope the compiler would remove the left shift then,
 #else                                 // but I've seen Clang just do a dumb left shift of zero. :(
     __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
     return _mm_srli_epi32(a, 24);
 #endif
 }
@@ -208,39 +244,62 @@ static inline __m128i SkPixel32ToPixel16
 
 // Portable version is SkPMSrcOver in SkColorPriv.h.
 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
     return _mm_add_epi32(src,
                          SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
                                                              SkGetPackedA32_SSE2(src))));
 }
 
-// Portable version is SkBlendARGB32 in SkColorPriv.h.
-static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
-                                         const __m128i& aa) {
-    __m128i src_scale = SkAlpha255To256_SSE2(aa);
-    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
-    __m128i dst_scale = SkGetPackedA32_SSE2(src);
-    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
-    dst_scale = _mm_srli_epi16(dst_scale, 8);
-    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
-
-    __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
-    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
-}
-
 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
                                          const unsigned aa) {
     unsigned alpha = SkAlpha255To256(aa);
-    __m128i src_scale = _mm_set1_epi32(alpha);
-    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    __m128i src_scale = _mm_set1_epi32(alpha);
+    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+    __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
+    dst_scale = _mm_srli_epi16(dst_scale, 8);
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
+
+    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
+    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+#else
+    __m128i src_scale = _mm_set1_epi16(alpha);
+    // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
     __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
     dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
-    dst_scale = _mm_srli_epi16(dst_scale, 8);
-    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
+    dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
+    dst_scale = _mm_srli_epi32(dst_scale, 8);
+    // Duplicate scales into 2x16-bit pattern per pixel.
+    dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+    dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+
+    const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+    // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
+    __m128i src_rb = _mm_and_si128(mask, src);
+    __m128i src_ag = _mm_srli_epi16(src, 8);
+    __m128i dst_rb = _mm_and_si128(mask, dst);
+    __m128i dst_ag = _mm_srli_epi16(dst, 8);
 
-    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
-    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+    // Scale them.
+    src_rb = _mm_mullo_epi16(src_rb, src_scale);
+    src_ag = _mm_mullo_epi16(src_ag, src_scale);
+    dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
+    dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
+
+    // Add the scaled source and destination.
+    dst_rb = _mm_add_epi16(src_rb, dst_rb);
+    dst_ag = _mm_add_epi16(src_ag, dst_ag);
+
+    // Unsplay the halves back together.
+    dst_rb = _mm_srli_epi16(dst_rb, 8);
+    dst_ag = _mm_andnot_si128(mask, dst_ag);
+    return _mm_or_si128(dst_rb, dst_ag);
+#endif
 }
 
 #undef ASSERT_EQ
 #endif // SkColor_opts_SSE2_DEFINED
--- a/layout/reftests/async-scrolling/reftest.list
+++ b/layout/reftests/async-scrolling/reftest.list
@@ -23,17 +23,17 @@ fuzzy-if(Android,5,4) skip-if(!asyncPan)
 skip-if(!asyncPan) == split-layers-1.html split-layers-1-ref.html
 skip-if(!asyncPan) == split-layers-multi-scrolling-1.html split-layers-multi-scrolling-1-ref.html
 fuzzy-if(skiaContent,2,240000) fuzzy-if(browserIsRemote&&!skiaContent&&(cocoaWidget||winWidget),1,240000) skip-if(!asyncPan) == split-opacity-layers-1.html split-opacity-layers-1-ref.html
 skip-if(!asyncPan) == sticky-pos-scrollable-1.html sticky-pos-scrollable-1-ref.html
 skip-if(!asyncPan) == fixed-pos-scrollable-1.html fixed-pos-scrollable-1-ref.html
 skip-if(!asyncPan) == culling-1.html culling-1-ref.html
 skip-if(!asyncPan) == position-fixed-iframe-1.html position-fixed-iframe-1-ref.html
 skip-if(!asyncPan) == position-fixed-iframe-2.html position-fixed-iframe-2-ref.html
-fuzzy-if(skiaContent||(browserIsRemote&&cocoaWidget),1,10000) skip-if(!asyncPan) == position-fixed-in-scroll-container.html position-fixed-in-scroll-container-ref.html
+fuzzy-if(skiaContent,1,11300) skip-if(!asyncPan) == position-fixed-in-scroll-container.html position-fixed-in-scroll-container-ref.html
 skip-if(!asyncPan) == position-fixed-inside-sticky-1.html position-fixed-inside-sticky-1-ref.html
 fuzzy(1,60000) skip-if(!asyncPan) == group-opacity-surface-size-1.html group-opacity-surface-size-1-ref.html
 skip-if(!asyncPan) == position-sticky-transformed.html position-sticky-transformed-ref.html
 skip-if(!asyncPan) == offscreen-prerendered-active-opacity.html offscreen-prerendered-active-opacity-ref.html
 fuzzy-if(Android,6,4) skip-if(!asyncPan) == offscreen-clipped-blendmode-1.html offscreen-clipped-blendmode-ref.html
 fuzzy-if(Android,6,4) skip-if(!asyncPan) == offscreen-clipped-blendmode-2.html offscreen-clipped-blendmode-ref.html
 fuzzy-if(Android,6,4) skip == offscreen-clipped-blendmode-3.html offscreen-clipped-blendmode-ref.html # bug 1251588 - wrong AGR on mix-blend-mode item
 fuzzy-if(Android,6,4) skip-if(!asyncPan) == offscreen-clipped-blendmode-4.html offscreen-clipped-blendmode-ref.html
--- a/layout/reftests/border-radius/reftest.list
+++ b/layout/reftests/border-radius/reftest.list
@@ -39,30 +39,30 @@ fuzzy-if(skiaContent,1,2728) == corner-4
 fails == clipping-1.html clipping-1-ref.html # background color should completely fill box; bug 466572
 != clipping-2.html about:blank # background color clipped to inner/outer border, can't get
 # great tests for this due to antialiasing problems described in bug 466572
 fuzzy-if(skiaContent,1,13) == clipping-3.html clipping-3-ref.xhtml # edge of border-radius clips an underlying object's background
 
 # Tests for clipping the contents of replaced elements and overflow!=visible
 != clipping-4-ref.html clipping-4-notref.html
 fuzzy-if(true,1,20) fuzzy-if(d2d,64,196) fuzzy-if(cocoaWidget,1,180) fuzzy-if(Android,140,237) == clipping-4-canvas.html clipping-4-ref.html # bug 732535
-fuzzy-if(Android,5,54) fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,10) == clipping-4-image.html clipping-4-ref.html
+fuzzy-if(Android,5,54) fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,10) fuzzy-if(skiaContent,1,140) == clipping-4-image.html clipping-4-ref.html
 fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,10) fuzzy-if(skiaContent,1,77) == clipping-4-overflow-hidden.html clipping-4-ref.html
 == clipping-5-canvas.html clipping-5-refc.html
 fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,5) == clipping-5-image.html clipping-5-refi.html
 fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,5) fuzzy-if(skiaContent,1,77) == clipping-5-overflow-hidden.html clipping-5-ref.html
 fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,5) fuzzy-if(Android,5,21) fuzzy-if(skiaContent,1,77) == clipping-5-refi.html clipping-5-ref.html
 fuzzy-if(true,1,7) fuzzy-if(d2d,48,94) fuzzy-if(cocoaWidget,1,99) fuzzy-if(Android,99,115) fuzzy-if(skiaContent,1,77) == clipping-5-refc.html clipping-5-ref.html # bug 732535
 fuzzy-if(winWidget,105,71) fuzzy-if(cocoaWidget,1,1) fuzzy-if(Android,8,469) == clipping-6.html clipping-6-ref.html # PaintedLayer and MaskLayer with transforms that aren't identical
 fuzzy-if(true,9,40) fuzzy-if(d2d,46,52) fuzzy-if(Android,255,586) fuzzy-if(skiaContent,19,36) == clipping-7.html clipping-7-ref.html # ColorLayer and MaskLayer with transforms that aren't identical. Reference image rendered without using layers (which causes fuzzy failures).
 fuzzy-if(/^Windows\x20NT\x206\.2/.test(http.oscpu),1,5) == clipping-and-zindex-1.html clipping-and-zindex-1-ref.html
 fuzzy-if(cocoaWidget,1,4) == intersecting-clipping-1-canvas.html intersecting-clipping-1-refc.html
 == intersecting-clipping-1-image.html intersecting-clipping-1-refi.html
 == intersecting-clipping-1-overflow-hidden.html intersecting-clipping-1-ref.html
-fuzzy-if(Android,5,105) fuzzy-if(d2d,1,20) fuzzy-if(skiaContent,1,135) == intersecting-clipping-1-refi.html intersecting-clipping-1-ref.html
+fuzzy-if(Android,5,105) fuzzy-if(d2d,1,20) fuzzy-if(skiaContent,1,250) == intersecting-clipping-1-refi.html intersecting-clipping-1-ref.html
 fuzzy-if(true,1,33) fuzzy-if(d2d,48,350) fuzzy-if(cocoaWidget,1,332) fuzzy-if(Android,124,440) fuzzy-if(skiaContent,1,135) == intersecting-clipping-1-refc.html intersecting-clipping-1-ref.html # bug 732535
 
 # Inheritance
 == inherit-1.html inherit-1-ref.html # border-radius shouldn't inherit
 
 # Table elements
 == table-collapse-1.html table-collapse-1-ref.html # border-radius is ignored on internal table elements
 # when border-collapse: collapse
--- a/layout/reftests/svg/reftest.list
+++ b/layout/reftests/svg/reftest.list
@@ -27,17 +27,17 @@ include svg-integration/reftest.list
 
 == baseline-middle-01.svg pass.svg
 == border-radius-01.html pass.svg
 == cssComment-in-attribute-01.svg cssComment-in-attribute-01-ref.svg
 == clip-01.svg pass.svg
 == clip-02a.svg clip-02-ref.svg
 == clip-02b.svg clip-02-ref.svg
 == clipPath-advanced-01.svg pass.svg
-fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||/^Windows\x20NT\x206\.[12]/.test(http.oscpu),1,5) fuzzy-if(azureQuartz,1,6) fuzzy-if(OSX,1,6) fuzzy-if(skiaContent,1,300) == clipPath-and-shape-rendering-01.svg clipPath-and-shape-rendering-01-ref.svg # bug 614840
+fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||/^Windows\x20NT\x206\.[12]/.test(http.oscpu),1,5) fuzzy-if(azureQuartz,1,6) fuzzy-if(OSX,1,6) fuzzy-if(skiaContent,1,630) == clipPath-and-shape-rendering-01.svg clipPath-and-shape-rendering-01-ref.svg # bug 614840
 == clipPath-and-transform-01.svg pass.svg
 == clipPath-basic-01.svg pass.svg
 == clipPath-basic-02.svg pass.svg
 == clipPath-basic-03.svg pass.svg
 == clipPath-basic-04.svg pass.svg
 == clipPath-basic-05.svg pass.svg
 == clipPath-basic-06.svg pass.svg
 == clipPath-basic-07.svg pass.svg
@@ -162,17 +162,17 @@ fuzzy-if(skiaContent,1,2) == fallback-co
 fails-if(Android||B2G) pref(security.fileuri.strict_origin_policy,true) == filter-extref-differentOrigin-01.svg pass.svg # Bug 695385
 == filter-foreignObject-01.svg pass.svg
 == filter-in-mask-01.svg pass.svg
 == filter-invalidation-01.svg pass.svg
 == filter-result-01.svg filter-result-01-ref.svg
 == filter-scaled-01.svg pass.svg
 fuzzy-if(skiaContent,1,500) == filter-scaled-02.html filter-scaled-02-ref.html
 == filter-translated-01.svg filter-translated-01-ref.svg
-fuzzy-if(skiaContent,1,2500) == filters-and-group-opacity-01.svg filters-and-group-opacity-01-ref.svg
+fuzzy-if(skiaContent,1,800000) == filters-and-group-opacity-01.svg filters-and-group-opacity-01-ref.svg
 == foreignObject-01.svg pass.svg
 == foreignObject-02.svg foreignObject-02-ref.svg
 == foreignObject-ancestor-style-change-01.svg foreignObject-ancestor-style-change-01-ref.svg
 == foreignObject-change-transform-01.svg pass.svg
 == foreignObject-display-01.svg pass.svg
 == foreignObject-form-theme.svg foreignObject-form-theme-ref.html
 == foreignObject-img-form-theme.html foreignObject-img-form-theme-ref.html
 == foreignObject-move-repaint-01.svg pass.svg
@@ -196,26 +196,26 @@ fuzzy-if(skiaContent,1,550) == import-sv
 == invalid-text-01.svg pass.svg
 == lang-attribute-01.svg pass.svg
 == lang-attribute-02.svg pass.svg
 == lang-attribute-03.svg pass.svg
 == linearGradient-basic-01.svg pass.svg
 == linearGradient-basic-02.svg pass.svg
 # off-by-one fuzziness expected. OS X is broken with bad aliasing though (bug 1023640).
 fuzzy-if(cocoaWidget,15,19679) fuzzy-if(winWidget,1,8800) fuzzy-if(!cocoaWidget&&!winWidget,1,4000) fuzzy-if(skiaContent,1,5000) == linearGradient-basic-03.svg linearGradient-basic-03-ref.svg
-fuzzy-if(skiaContent,1,20000) == markers-and-group-opacity-01.svg markers-and-group-opacity-01-ref.svg
+fuzzy-if(skiaContent,1,800000) == markers-and-group-opacity-01.svg markers-and-group-opacity-01-ref.svg
 == marker-attribute-01.svg pass.svg
 == marker-effects-01.svg marker-effects-01-ref.svg
 fuzzy-if(skiaContent,1,100) == marker-viewBox-01.svg marker-viewBox-01-ref.svg
 fuzzy-if(skiaContent,1,100) == marker-orientation-01.svg marker-orientation-01-ref.svg
 fuzzy-if(skiaContent,1,5) pref(svg.marker-improvements.enabled,true) == marker-orientation-02.svg marker-orientation-02-ref.svg
 == marker-orientation-03.svg pass.svg
 == marker-orientation-04.svg pass.svg
 == mask-basic-01.svg pass.svg
-== mask-basic-02.svg mask-basic-02-ref.svg
+fuzzy-if(skiaContent,1,10000) == mask-basic-02.svg mask-basic-02-ref.svg
 == mask-basic-03.svg pass.svg
 == mask-basic-04.svg pass.svg
 == mask-extref-dataURI-01.svg pass.svg
 == mask-containing-masked-content-01.svg pass.svg
 == mask-transformed-01.svg mask-transformed-01-ref.svg
 == mask-transformed-02.svg pass.svg
 == mask-transformed-child-01.svg mask-transformed-child-01-ref.svg
 # fuzzy because of the differences between clipPath and mask clipping