Bug 1541350 - optimize SkBlitRow::Color32 for SSE2. r=jrmuizel
☠☠ backed out by 39655232be1d ☠ ☠
author Lee Salzman <lsalzman@mozilla.com>
Wed, 10 Apr 2019 14:23:36 +0000
changeset 468793 68df166b4d9e53aaa1ff1e39ea6f3e534814f891
parent 468792 59c870edc6771c3de626e692bd6db1501a58e0e9
child 468794 323eef8fd5ff8d4968c77fb6c77d65b010b31ad6
push id 112755
push user dvarga@mozilla.com
push date Wed, 10 Apr 2019 22:06:41 +0000
treeherder mozilla-inbound@606f85641d0b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 1541350
milestone 68.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1541350 - optimize SkBlitRow::Color32 for SSE2. r=jrmuizel

Differential Revision: https://phabricator.services.mozilla.com/D26346
gfx/skia/skia/src/core/SkBlitRow_D32.cpp
--- a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+++ b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
@@ -311,15 +311,42 @@ void SkBlitRow::Color32(SkPMColor dst[],
         case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
         case 255: sk_memset32(dst, color, count);               return;
     }
 
     unsigned invA = 255 - SkGetPackedA32(color);
     invA += invA >> 7;
     SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    // The generic implementation computes (src*alpha + (color<<8) + 128) >> 8.
+    // Rewriting this as ((alpha * ((src<<8) + 0x80FF/alpha)) >> 16) + color
+    // can use _mm_mulhi to remove the final shift and _mm_unpack to remove the
+    // initial src shift/bias. This ultimately shaves off 2 insns from the inner
+    // loop. It returns the same results in all cases.
+    unsigned bias = 0x80FF / invA;
+    // Split up the bias so the low part can be cheaply interleaved.
+    __m128i biasLo = _mm_set1_epi8(bias & 0xFF);
+    __m128i biasHi = _mm_set1_epi8(bias >> 8);
+    __m128i invA8 = _mm_set1_epi16(invA);
+    __m128i color4 = _mm_set1_epi32(color);
+
+    Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
+        // Add the high bits of the bias.
+        __m128i srcBiasHi = _mm_add_epi8(src4.fVec, biasHi);
+        // Shift the source and interleave in the low bits of the bias.
+        __m128i lo = _mm_unpacklo_epi8(biasLo, srcBiasHi);
+        __m128i hi = _mm_unpackhi_epi8(biasLo, srcBiasHi);
+        // Multiply the biased source by the alpha, keeping only the high part.
+        lo = _mm_mulhi_epu16(lo, invA8);
+        hi = _mm_mulhi_epu16(hi, invA8);
+        // Narrow/combine the results and add the blended color.
+        return Sk4px(_mm_add_epi8(_mm_packus_epi16(lo, hi), color4));
+    });
+#else
     Sk16h colorHighAndRound = (Sk4px::DupPMColor(color).widen() << 8) + Sk16h(128);
     Sk16b invA_16x(invA);
 
     Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
         return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
     });
+#endif
 }