Bug 1635616 - shuffling optimizations for SWGL x86 bilinear filtering code. r=jimb
author Lee Salzman <lsalzman@mozilla.com>
date Fri, 08 May 2020 09:58:23 +0000
changeset 528803 81c36b4b79c6fd75e807b20e8dca2e1821ff4624
parent 528802 97e4cf6414760ae2db49b7ab00f181b6ef6a4ee0
child 528804 7a34f25d1dc85abd5684002f79725f8d495d1a44
push id 37396
push user ncsoregi@mozilla.com
push date Fri, 08 May 2020 15:58:04 +0000
treeherder mozilla-central@ac7a5cda729f
reviewers jimb
bugs 1635616
milestone 78.0a1
Bug 1635616 - shuffling optimizations for SWGL x86 bilinear filtering code. r=jimb

Differential Revision: https://phabricator.services.mozilla.com/D73979
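
For context, here is a minimal scalar sketch (not part of this patch) of the fixed-point bilinear blend that both SSE2 paths below vectorize four samples at a time. It assumes the 8-bit fractional coordinates used in the USE_SSE2 branches and ignores the edge handling that zeroes fractions at the texture border; fetchClampedR8 is a hypothetical helper, not SWGL API.

#include <algorithm>
#include <cstdint>

// Hypothetical helper: read one 8-bit channel at coordinates clamped to the
// texture, mirroring the min/max clamping done on packed 16-bit coords below.
static uint8_t fetchClampedR8(const uint8_t* buf, int stride, int width,
                              int height, int x, int y) {
  x = std::min(std::max(x, 0), width - 1);
  y = std::min(std::max(y, 0), height - 1);
  return buf[y * stride + x];
}

// Scalar model of the weighting: fx, fy are 8-bit fractions in [0, 255].
// The RGBA8 path blends X first and then uses a madd with (256 - fy, fy)
// weights for Y; the R8 path swaps the roles of fx and fy, but the
// arithmetic is the same.
static float sampleLinear(const uint8_t* buf, int stride, int width,
                          int height, int ix, int iy, int fx, int fy) {
  int c00 = fetchClampedR8(buf, stride, width, height, ix, iy);
  int c10 = fetchClampedR8(buf, stride, width, height, ix + 1, iy);
  int c01 = fetchClampedR8(buf, stride, width, height, ix, iy + 1);
  int c11 = fetchClampedR8(buf, stride, width, height, ix + 1, iy + 1);
  // Horizontal blend at 8-bit precision; the result still fits in a byte.
  int top = c00 + (((c10 - c00) * fx) >> 8);
  int bot = c01 + (((c11 - c01) * fx) >> 8);
  // Vertical blend as a (256 - fy, fy) weighted sum, i.e. what
  // _mm_madd_epi16 computes per lane; the maximum is 255 * 256 = 0xFF00,
  // hence the 1.0f / 0xFF00 normalization in the code below.
  int weighted = top * (256 - fy) + bot * fy;
  return weighted * (1.0f / 0xFF00);
}
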
gfx/wr/swgl/src/glsl.h
--- a/gfx/wr/swgl/src/glsl.h
+++ b/gfx/wr/swgl/src/glsl.h
@@ -2463,75 +2463,113 @@ template <typename S>
 vec4 textureLinearRGBA8(S sampler, vec2 P, I32 zoffset = 0) {
   assert(sampler->format == TextureFormat::RGBA8);
 
 #if USE_SSE2
   ivec2 i(linearQuantize(P, 256, sampler));
   ivec2 frac = i & (I32)0xFF;
   i >>= 8;
 
-  __m128i row0 = _mm_min_epi16(_mm_max_epi16(i.y, _mm_setzero_si128()),
-                               _mm_set1_epi32(sampler->height - 1));
-  row0 = _mm_madd_epi16(row0, _mm_set1_epi32(sampler->stride));
-  row0 =
-      _mm_add_epi32(row0, _mm_min_epi16(_mm_max_epi16(i.x, _mm_setzero_si128()),
-                                        _mm_set1_epi32(sampler->width - 1)));
+  // Pack coords so they get clamped into range, and also for later bounding
+  // of fractional coords. Store Y as low-bits for easier access, X as high.
+  __m128i yx = _mm_packs_epi32(i.y, i.x);
+  __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1),
+                               _mm_set1_epi32(sampler->width - 1));
+  // Clamp coords to valid range to prevent sampling outside texture.
+  __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw);
+  // Multiply clamped Y by stride and add X offset.
+  __m128i row0 = _mm_madd_epi16(
+      _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()),
+      _mm_set1_epi16(sampler->stride));
+  row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128()));
+  // Add in layer offset if available
   row0 = _mm_add_epi32(row0, zoffset);
 
-  if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_or_si128(frac.x, frac.y),
-                                       _mm_setzero_si128())) == 0xFFFF) {
+  // Check if fractional coords are all zero, in which case skip filtering.
+  __m128i fracyx = _mm_packs_epi32(frac.y, frac.x);
+  if (!_mm_movemask_epi8(_mm_cmpgt_epi16(fracyx, _mm_setzero_si128()))) {
     return fetchOffsetsRGBA8(sampler, row0);
   }
 
-  __m128i yinside = _mm_andnot_si128(
-      _mm_cmplt_epi32(i.y, _mm_setzero_si128()),
-      _mm_cmplt_epi32(i.y, _mm_set1_epi32(sampler->height - 1)));
-  __m128i row1 = _mm_and_si128(yinside, _mm_set1_epi32(sampler->stride));
-
-  __m128i xinside = _mm_andnot_si128(
-      _mm_cmplt_epi32(i.x, _mm_setzero_si128()),
-      _mm_cmplt_epi32(i.x, _mm_set1_epi32(sampler->width - 1)));
-  __m128i fracx = _mm_and_si128(xinside, frac.x);
-  fracx = _mm_shufflelo_epi16(fracx, _MM_SHUFFLE(2, 2, 0, 0));
-  fracx = _mm_shufflehi_epi16(fracx, _MM_SHUFFLE(2, 2, 0, 0));
-  fracx = _mm_slli_epi16(fracx, 4);
-
-  __m128i fracy = _mm_or_si128(_mm_slli_epi32(frac.y, 16),
-                               _mm_sub_epi32(_mm_set1_epi32(256), frac.y));
-
+  // Check if coords were clamped at all above. If so, need to adjust fractions
+  // to avoid sampling outside the texture on the edges.
+  __m128i yxinside = _mm_andnot_si128(
+      _mm_cmplt_epi16(yx, _mm_setzero_si128()),
+      _mm_cmplt_epi16(yx, hw));
+  // Set fraction to zero when outside.
+  fracyx = _mm_and_si128(fracyx, yxinside);
+  // Store two side-by-side copies of X fraction, as below each pixel value
+  // will be interleaved to be next to the pixel value for the next row.
+  __m128i fracx = _mm_unpackhi_epi16(fracyx, fracyx);
+  // For Y fraction, we need to store 1-fraction before each fraction, as a
+  // madd will be used to weight and collapse all results as last step.
+  __m128i fracy = _mm_unpacklo_epi16(
+      _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx);
+
+  // Ensure we don't sample row off end of texture from added stride.
+  __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride));
+
+  // Load two adjacent pixels on each row and interleave them.
   // r0,g0,b0,a0,r1,g1,b1,a1 \/ R0,G0,B0,A0,R1,G1,B1,A1
   // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1
-  // r0_,R0_,g0_,G0_,b0_,B0_,a0_,A0_ /\ r1_,R1_,g1_,G1_,b1_,B1_,a1_,A1_
-  // (r0*(256-fracy) + R0*fracy), ...
-  __m128 r0, r1, r2, r3;
-#  define FILTER_LANE(out, idx)                                               \
-    {                                                                         \
-      uint32_t* buf = &sampler->buf[_mm_cvtsi128_si32(                        \
-          _mm_shuffle_epi32(row0, _MM_SHUFFLE(idx, idx, idx, idx)))];         \
-      __m128i cc = _mm_unpacklo_epi8(                                         \
-          _mm_loadl_epi64((__m128i*)buf),                                     \
-          _mm_loadl_epi64(                                                    \
-              (__m128i*)(buf + _mm_extract_epi16(row1, 2 * idx))));           \
-      __m128i cc0 = _mm_unpacklo_epi8(cc, _mm_setzero_si128());               \
-      __m128i cc1 = _mm_unpackhi_epi8(cc, _mm_setzero_si128());               \
-      cc = _mm_add_epi16(                                                     \
-          cc0, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(cc1, cc0), 4),    \
-                               _mm_shuffle_epi32(                             \
-                                   fracx, _MM_SHUFFLE(idx, idx, idx, idx)))); \
-      out = _mm_cvtepi32_ps(_mm_madd_epi16(                                   \
-          cc, _mm_shuffle_epi32(fracy, _MM_SHUFFLE(idx, idx, idx, idx))));    \
+#  define LOAD_LANE(out, idx)                                               \
+    {                                                                       \
+      uint32_t* buf = &sampler->buf[_mm_cvtsi128_si32(                      \
+          _mm_shuffle_epi32(row0, _MM_SHUFFLE(idx, idx, idx, idx)))];       \
+      out = _mm_unpacklo_epi8(                                              \
+          _mm_loadl_epi64((__m128i*)buf),                                   \
+          _mm_loadl_epi64((__m128i*)(buf + _mm_extract_epi16(row1, idx)))); \
     }
-  FILTER_LANE(r0, 0);
-  FILTER_LANE(r1, 1);
-  FILTER_LANE(r2, 2);
-  FILTER_LANE(r3, 3);
-#  undef FILTER_LANE
-
-  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-  return vec4(r2, r1, r0, r3) * (1.0f / 0xFF00);
+  __m128i x, y, z, w;
+  LOAD_LANE(x, 0)
+  LOAD_LANE(y, 1)
+  LOAD_LANE(z, 2)
+  LOAD_LANE(w, 3)
+#  undef LOAD_LANE
+
+  // Need to transpose the data from AoS to SoA format. Best to do this here
+  // while the data is still packed into 8-bit components, requiring fewer
+  // insns.
+  // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 \/
+  // r2,R2,g2,G2,b2,B2,a2,A2,r3,R3,g3,G3,b3,B3,a3,A3
+  // ... r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2
+  // ... r1,R1,r3,R3,g1,G1,g3,G3,b1,B1,b3,B3,a1,A1,a3,A3
+  __m128i xy0 = _mm_unpacklo_epi16(x, y);
+  __m128i xy1 = _mm_unpackhi_epi16(x, y);
+  __m128i zw0 = _mm_unpacklo_epi16(z, w);
+  __m128i zw1 = _mm_unpackhi_epi16(z, w);
+  // r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 \/
+  // r4,R4,r6,R6,g4,G4,g6,G6,b4,B4,b6,B6,a4,A4,a6,A6
+  // ... r0,R0,r2,R2,r4,R4,r6,R6,g0,G0,g2,G2,g4,G4,g6,G6
+  // ... b0,B0,b2,B2,b4,B4,b6,B6,a0,A0,a2,A2,a4,A4,a6,A6
+  __m128i rg0 = _mm_unpacklo_epi32(xy0, zw0);
+  __m128i ba0 = _mm_unpackhi_epi32(xy0, zw0);
+  __m128i rg1 = _mm_unpacklo_epi32(xy1, zw1);
+  __m128i ba1 = _mm_unpackhi_epi32(xy1, zw1);
+
+  // Expand packed SoA pixels for each column. Multiply then add columns with
+  // 8-bit precision so we don't carry to high byte of word accidentally. Use
+  // final madd insn to blend interleaved rows and expand result to 32 bits.
+#  define FILTER_COMPONENT(out, unpack, src0, src1)                            \
+    {                                                                          \
+      __m128i cc0 = unpack(src0, _mm_setzero_si128());                         \
+      __m128i cc1 = unpack(src1, _mm_setzero_si128());                         \
+      cc0 = _mm_add_epi8(                                                      \
+          cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracx), \
+                              8));                                             \
+      out = _mm_cvtepi32_ps(_mm_madd_epi16(cc0, fracy));                       \
+    }
+  __m128 fr, fg, fb, fa;
+  FILTER_COMPONENT(fr, _mm_unpacklo_epi8, rg0, rg1);
+  FILTER_COMPONENT(fg, _mm_unpackhi_epi8, rg0, rg1);
+  FILTER_COMPONENT(fb, _mm_unpacklo_epi8, ba0, ba1);
+  FILTER_COMPONENT(fa, _mm_unpackhi_epi8, ba0, ba1);
+#  undef FILTER_COMPONENT
+
+  return vec4(fb, fg, fr, fa) * (1.0f / 0xFF00);
 #else
   ivec2 i(linearQuantize(P, 128, sampler));
   ivec2 frac = i & (I32)0x7F;
   i >>= 7;
 
   I32 row0 = clampCoord(i.x, sampler->width) +
              clampCoord(i.y, sampler->height) * sampler->stride + zoffset;
   I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
@@ -2624,62 +2662,79 @@ template <typename S>
 vec4 textureLinearR8(S sampler, vec2 P, I32 zoffset = 0) {
   assert(sampler->format == TextureFormat::R8);
 
 #if USE_SSE2
   ivec2 i(linearQuantize(P, 256, sampler));
   ivec2 frac = i & (I32)0xFF;
   i >>= 8;
 
-  __m128i row0 = _mm_min_epi16(_mm_max_epi16(i.y, _mm_setzero_si128()),
-                               _mm_set1_epi32(sampler->height - 1));
-  row0 = _mm_madd_epi16(row0, _mm_set1_epi32(sampler->stride));
-  row0 =
-      _mm_add_epi32(row0, _mm_min_epi16(_mm_max_epi16(i.x, _mm_setzero_si128()),
-                                        _mm_set1_epi32(sampler->width - 1)));
+  // Pack coords so they get clamped into range, and also for later bounding
+  // of fractional coords. Store Y as low-bits for easier access, X as high.
+  __m128i yx = _mm_packs_epi32(i.y, i.x);
+  __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1),
+                               _mm_set1_epi32(sampler->width - 1));
+  // Clamp coords to valid range to prevent sampling outside texture.
+  __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw);
+  // Multiply clamped Y by stride and add X offset.
+  __m128i row0 = _mm_madd_epi16(
+      _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()),
+      _mm_set1_epi16(sampler->stride));
+  row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128()));
+  // Add in layer offset if available
   row0 = _mm_add_epi32(row0, zoffset);
 
-  __m128i yinside = _mm_andnot_si128(
-      _mm_cmplt_epi32(i.y, _mm_setzero_si128()),
-      _mm_cmplt_epi32(i.y, _mm_set1_epi32(sampler->height - 1)));
-  __m128i row1 = _mm_and_si128(yinside, _mm_set1_epi32(sampler->stride));
-
-  __m128i xinside = _mm_andnot_si128(
-      _mm_cmplt_epi32(i.x, _mm_setzero_si128()),
-      _mm_cmplt_epi32(i.x, _mm_set1_epi32(sampler->width - 1)));
-  __m128i fracx = _mm_and_si128(xinside, frac.x);
-  fracx = _mm_or_si128(_mm_slli_epi32(fracx, 16),
-                       _mm_sub_epi32(_mm_set1_epi32(256), fracx));
-
-  __m128i fracy = _mm_slli_epi16(frac.y, 4);
-  fracy = _mm_shufflelo_epi16(fracy, _MM_SHUFFLE(2, 2, 0, 0));
-  fracy = _mm_shufflehi_epi16(fracy, _MM_SHUFFLE(2, 2, 0, 0));
-
+  __m128i fracyx = _mm_packs_epi32(frac.y, frac.x);
+
+  // Check if coords were clamped at all above. If so, need to adjust fractions
+  // to avoid sampling outside the texture on the edges.
+  __m128i yxinside = _mm_andnot_si128(
+      _mm_cmplt_epi16(yx, _mm_setzero_si128()),
+      _mm_cmplt_epi16(yx, hw));
+  // Set fraction to zero when outside.
+  fracyx = _mm_and_si128(fracyx, yxinside);
+  // For X fraction, we need to store 1-fraction before each fraction, as a
+  // madd will be used to weight and collapse all results as last step.
+  __m128i fracx = _mm_unpackhi_epi16(
+      _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx);
+  // Store two side-by-side copies of Y fraction, as below each pixel value
+  // will be interleaved to be next to the pixel value for the next column.
+  __m128i fracy = _mm_unpacklo_epi16(fracyx, fracyx);
+
+  // Ensure we don't sample row off end of texture from added stride.
+  __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride));
+
+  // Calculate pointers for first row in each lane
   uint8_t* buf = (uint8_t*)sampler->buf;
   uint8_t* buf0 =
       buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(0, 0, 0, 0)));
   uint8_t* buf1 =
       buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(1, 1, 1, 1)));
   uint8_t* buf2 =
       buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(2, 2, 2, 2)));
   uint8_t* buf3 =
       buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 3, 3, 3)));
+  // Load adjacent columns from first row, pack into register, then expand.
   __m128i cc0 = _mm_unpacklo_epi8(
       _mm_setr_epi16(*(uint16_t*)buf0, *(uint16_t*)buf1, *(uint16_t*)buf2,
                      *(uint16_t*)buf3, 0, 0, 0, 0),
       _mm_setzero_si128());
+  // Load adjacent columns from next row, pack into register, then expand.
   __m128i cc1 = _mm_unpacklo_epi8(
       _mm_setr_epi16(*(uint16_t*)(buf0 + _mm_extract_epi16(row1, 0)),
-                     *(uint16_t*)(buf1 + _mm_extract_epi16(row1, 2)),
-                     *(uint16_t*)(buf2 + _mm_extract_epi16(row1, 4)),
-                     *(uint16_t*)(buf3 + _mm_extract_epi16(row1, 6)), 0, 0, 0,
-                     0),
+                     *(uint16_t*)(buf1 + _mm_extract_epi16(row1, 1)),
+                     *(uint16_t*)(buf2 + _mm_extract_epi16(row1, 2)),
+                     *(uint16_t*)(buf3 + _mm_extract_epi16(row1, 3)),
+                     0, 0, 0, 0),
       _mm_setzero_si128());
-  __m128i cc = _mm_add_epi16(
-      cc0, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(cc1, cc0), 4), fracy));
+  // Multiply then add rows with 8-bit precision so we don't carry to high byte
+  // of word accidentally. Use final madd insn to blend interleaved columns and
+  // expand result to 32 bits.
+  __m128i cc = _mm_add_epi8(
+      cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracy), 8));
   __m128 r = _mm_cvtepi32_ps(_mm_madd_epi16(cc, fracx));
   return vec4((Float)r * (1.0f / 0xFF00), 0.0f, 0.0f, 1.0f);
 #else
   ivec2 i(linearQuantize(P, 128, sampler));
   Float r = CONVERT(textureLinearPackedR8(sampler, i, zoffset), Float);
   return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
 #endif
 }
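
As a standalone illustration (not from this changeset) of the _mm_madd_epi16 weighting trick that both functions above now lean on, the snippet below blends interleaved (sample, next-sample) 16-bit pairs with (256 - f, f) weights; the sample values and weights are made up for the demo.

#include <emmintrin.h>
#include <cstdio>

int main() {
  // Four (sample, next-sample) pairs packed as adjacent 16-bit words.
  __m128i pairs = _mm_setr_epi16(10, 200, 0, 255, 128, 128, 255, 0);
  // Matching (256 - f, f) weights, one pair of weights per sample pair.
  __m128i weights =
      _mm_setr_epi16(256 - 64, 64, 256 - 128, 128, 256 - 0, 0, 256 - 255, 255);
  // madd multiplies adjacent words and sums each pair into one 32-bit lane:
  // lane i = s_i * (256 - f_i) + n_i * f_i, i.e. the blend scaled by 256.
  __m128i blended = _mm_madd_epi16(pairs, weights);
  int out[4];
  _mm_storeu_si128((__m128i*)out, blended);
  for (int i = 0; i < 4; ++i) {
    // Divide by 256 to recover the blended sample; the patch instead folds
    // this scale into the final multiply by 1.0f / 0xFF00.
    printf("blend %d = %d\n", i, out[i] / 256);
  }
  return 0;
}
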