Bug 1387399: Add SIMD optimizations for methods used frequently in nsRect. r=jrmuizel draft
author Bas Schouten <bschouten@mozilla.com>
Thu, 19 Apr 2018 15:56:29 +0200
changeset 786562 4b66a00f1b1b8ed9849ebf824a7ab8c2c526ad4a
parent 786536 d1479b21a2848f1ed2b73fb54202b5ba5e042ef3
push id 107515
push user bschouten@mozilla.com
push date Mon, 23 Apr 2018 15:28:38 +0000
reviewers jrmuizel
bugs 1387399
milestone 61.0a1
Bug 1387399: Add SIMD optimizations for methods used frequently in nsRect. r=jrmuizel This improves the DisplayList Mutate Talos test by about 5% on windows, as well as numerous smaller improvements on DisplayList heavy tasks. MozReview-Commit-ID: tlEtPjqWI4
gfx/2d/2D.h
gfx/2d/BaseRect.h
gfx/2d/Factory.cpp
gfx/src/nsRect.h
layout/generic/WritingModes.h
--- a/gfx/2d/2D.h
+++ b/gfx/2d/2D.h
@@ -1548,16 +1548,17 @@ struct Config {
 class GFX2D_API Factory
 {
   using char_type = filesystem::Path::value_type;
 public:
   static void Init(const Config& aConfig);
   static void ShutDown();
 
   static bool HasSSE2();
+  static bool HasSSE4();
 
   /**
    * Returns false if any of the following are true:
    *
    *   - the width/height of |sz| are less than or equal to zero
    *   - the width/height of |sz| are greater than |limit|
    *   - the number of bytes that need to be allocated for the surface is too
    *     big to fit in an int32_t, or bigger than |allocLimit|, if specifed
--- a/gfx/2d/BaseRect.h
+++ b/gfx/2d/BaseRect.h
@@ -119,17 +119,17 @@ struct BaseRect {
   // of *this and aRect.
   MOZ_MUST_USE Sub Intersect(const Sub& aRect) const
   {
     Sub result;
     result.x = std::max<T>(x, aRect.x);
     result.y = std::max<T>(y, aRect.y);
     result.width = std::min<T>(x - result.x + width, aRect.x - result.x + aRect.width);
     result.height = std::min<T>(y - result.y + height, aRect.y - result.y + aRect.height);
-    if (result.width < 0 || result.height < 0) {
+    if (result.width <= 0 || result.height <= 0) { // a zero extent in either dimension also collapses the size to (0, 0)
       result.SizeTo(0, 0);
     }
     return result;
   }
   // Sets *this to be the rectangle containing the intersection of the points
   // (including edges) of *this and aRect. If there are no points in that
   // intersection, sets *this to be an empty rectangle with x/y set to the std::max
   // of the x/y of *this and aRect.
--- a/gfx/2d/Factory.cpp
+++ b/gfx/2d/Factory.cpp
@@ -82,17 +82,18 @@ GetGFX2DLog()
 
 // The following code was largely taken from xpcom/glue/SSE.cpp and
 // made a little simpler.
 enum CPUIDRegister { eax = 0, ebx = 1, ecx = 2, edx = 3 };
 
 #ifdef HAVE_CPUID_H
 
 #if !(defined(__SSE2__) || defined(_M_X64) || \
-     (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) || \
+    !defined(__SSE4__)
 // cpuid.h is available on gcc 4.3 and higher on i386 and x86_64
 #include <cpuid.h>
 
 static inline bool
 HasCPUIDBit(unsigned int level, CPUIDRegister reg, unsigned int bit)
 {
   unsigned int regs[4];
   return __get_cpuid(level, &regs[0], &regs[1], &regs[2], &regs[3]) &&
@@ -277,16 +278,39 @@ Factory::HasSSE2()
     sDetectionState = HasCPUIDBit(1u, edx, (1u<<26)) ? HAS_SSE2 : NO_SSE2;
   }
   return sDetectionState == HAS_SSE2;
 #else
   return false;
 #endif
 }
 
+bool
+Factory::HasSSE4()
+{
+#if defined(__SSE4_1__)
+  // SSE4.1 is part of the compile target: GCC/clang predefine __SSE4_1__
+  // (never __SSE4__) under -msse4.1, so no runtime probe is needed.
+  return true;
+#elif defined(HAVE_CPU_DETECTION)
+  static enum {
+    UNINITIALIZED,
+    NO_SSE4,
+    HAS_SSE4
+  } sDetectionState = UNINITIALIZED;
+  // Probe CPUID.1:ECX bit 19 (SSE4.1) on the first call and cache the answer.
+  if (sDetectionState == UNINITIALIZED) {
+    sDetectionState = HasCPUIDBit(1u, ecx, (1u << 19)) ? HAS_SSE4 : NO_SSE4;
+  }
+  return sDetectionState == HAS_SSE4;
+#else
+  return false;
+#endif
+}
+
 // If the size is "reasonable", we want gfxCriticalError to assert, so
 // this is the option set up for it.
 inline int LoggerOptionsBasedOnSize(const IntSize& aSize)
 {
   return CriticalLog::DefaultOptions(Factory::ReasonableSurfaceSize(aSize));
 }
 
 bool
--- a/gfx/src/nsRect.h
+++ b/gfx/src/nsRect.h
@@ -8,22 +8,27 @@
 #ifndef NSRECT_H
 #define NSRECT_H
 
 #include <stdio.h>                      // for FILE
 #include <stdint.h>                     // for int32_t, int64_t
 #include <algorithm>                    // for min/max
 #include "mozilla/Likely.h"             // for MOZ_UNLIKELY
 #include "mozilla/gfx/Rect.h"
+#include "mozilla/gfx/2D.h"
+#include "mozilla/gfx/Logging.h"
 #include "nsCoord.h"                    // for nscoord, etc
 #include "nsISupportsImpl.h"            // for MOZ_COUNT_CTOR, etc
 #include "nsPoint.h"                    // for nsIntPoint, nsPoint
 #include "nsMargin.h"                   // for nsIntMargin, nsMargin
 #include "nsSize.h"                     // for IntSize, nsSize
 #include "nscore.h"                     // for NS_BUILD_REFCNT_LOGGING
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#include "smmintrin.h"
+#endif
 
 typedef mozilla::gfx::IntRect nsIntRect;
 
 struct nsRect :
   public mozilla::gfx::BaseRect<nscoord, nsRect, nsPoint, nsSize, nsMargin> {
   typedef mozilla::gfx::BaseRect<nscoord, nsRect, nsPoint, nsSize, nsMargin> Super;
 
   static void VERIFY_COORD(nscoord aValue) { ::VERIFY_COORD(aValue); }
@@ -115,16 +120,89 @@ struct nsRect :
   MOZ_MUST_USE nsRect UnsafeUnion(const nsRect& aRect) const
   {
     return Super::Union(aRect);
   }
   void UnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.Union(aRect2);
   }
+
+#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+  // Only MSVC supports inlining intrinsics for archs you're not compiling for, so
+  // these SSE4.1 overrides are MSVC-only and pick their path at runtime via HasSSE4().
+  // The x86/x64 test mirrors the smmintrin.h include guard; other MSVC targets skip this.
+  //
+  // Returns the intersection of *this and aRect. If that intersection has no area,
+  // x/y are still the max of the two inputs' x/y and width/height are both 0.
+  MOZ_MUST_USE nsRect Intersect(const nsRect& aRect) const
+  {
+    nsRect result;
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      // Each 16-byte load reads one rect as four int32 lanes (x, y, width, height).
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect); // x1, y1, w1, h1
+      __m128i rect2 = _mm_loadu_si128((__m128i*)this); // x2, y2, w2, h2
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+      // result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // w, h, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      // Movemask bits 2 and 3 are set only when wr > 0 and hr > 0 respectively.
+      if ((_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(resultRect, _mm_setzero_si128()))) & 0xC) != 0xC) {
+        // It's potentially more efficient to store all 0s. But the non SSE4 code leaves x/y intact
+        // so let's do the same here.
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+      }
+      _mm_storeu_si128((__m128i*)&result, resultRect);
+      return result;
+    }
+
+    result.x = std::max<int32_t>(x, aRect.x);
+    result.y = std::max<int32_t>(y, aRect.y);
+    result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+    result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+    if (result.width <= 0 || result.height <= 0) {
+      result.SizeTo(0, 0);
+    }
+    return result;
+  }
+
+  // Sets *this to the intersection of aRect1 and aRect2, computed as in Intersect().
+  // Returns true when the result is non-empty, false otherwise.
+  bool IntersectRect(const nsRect& aRect1, const nsRect& aRect2)
+  {
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect1); // x1, y1, w1, h1
+      __m128i rect2 = _mm_loadu_si128((__m128i*)&aRect2); // x2, y2, w2, h2
+
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+      // result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // w, h, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      if ((_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(resultRect, _mm_setzero_si128()))) & 0xC) != 0xC) {
+        // It's potentially more efficient to store all 0s. But the non SSE4 code leaves x/y intact
+        // so let's do the same here.
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+        _mm_storeu_si128((__m128i*)this, resultRect);
+        return false;
+      }
+      _mm_storeu_si128((__m128i*)this, resultRect);
+      return true;
+    }
+    *static_cast<nsRect*>(this) = aRect1.Intersect(aRect2);
+    return !IsEmpty();
+  }
+#endif
 #endif
 
   void SaturatingUnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.SaturatingUnion(aRect2);
   }
   void SaturatingUnionRectEdges(const nsRect& aRect1, const nsRect& aRect2)
   {
@@ -209,47 +287,135 @@ nsRect::ScaleToOtherAppUnitsRoundIn(int3
   nsRect rect;
   rect.SetBox(NSToCoordCeil(NSCoordScale(x, aFromAPP, aToAPP)),
               NSToCoordCeil(NSCoordScale(y, aFromAPP, aToAPP)),
               NSToCoordFloor(NSCoordScale(XMost(), aFromAPP, aToAPP)),
               NSToCoordFloor(NSCoordScale(YMost(), aFromAPP, aToAPP)));
   return rect;
 }
 
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+// Per-lane floor(x) as int32 using SSE2 only. Life would be so much better if we had SSE4 here.
+static MOZ_ALWAYS_INLINE __m128i floor_ps2epi32(__m128 x)
+{
+  __m128 one = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+  // t = x truncated toward zero, converted back to float.
+  __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
+  __m128 r = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(x, t), one)); // subtract 1 in lanes where x < t
+  // r now holds the floored values; convert them to int32 lanes.
+  return _mm_cvttps_epi32(r);
+}
+// Per-lane ceil(x) as int32, computed as -floor(-x).
+static MOZ_ALWAYS_INLINE __m128i ceil_ps2epi32(__m128 x)
+{
+  __m128 t = _mm_sub_ps(_mm_setzero_ps(), x);
+  __m128i r = _mm_sub_epi32(_mm_setzero_si128(), floor_ps2epi32(t));
+
+  return r;
+}
+#endif
+
 // scale the rect but round to preserve centers
 inline mozilla::gfx::IntRect
 nsRect::ScaleToNearestPixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {
   mozilla::gfx::IntRect rect;
-  rect.SetNonEmptyBox(NSToIntRoundUp(NSAppUnitsToDoublePixels(x,
+  // Rounding is done explicitly (floor_ps2epi32) rather than via the SSE rounding mode,
+  // which ASAN builds appear not to respect. Android x86 builds have bindgen issues.
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+  __m128 appUnitsPacked = _mm_set_ps(aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel);
+  __m128 scalesPacked = _mm_set_ps(aYScale, aXScale, aYScale, aXScale);
+  __m128 biasesPacked = _mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f);
+
+  __m128i rectPacked = _mm_loadu_si128((__m128i*)this); // x, y, w, h
+  __m128i topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, x, y
+
+  rectPacked = _mm_add_epi32(rectPacked, topLeft); // X, Y, XMost(), YMost()
+
+  __m128 rectFloat = _mm_cvtepi32_ps(rectPacked);
+
+  // Scale, i.e. ([ x y xmost ymost ] / aAppUnitsPerPixel) * [ aXScale aYScale aXScale aYScale ]
+  rectFloat = _mm_mul_ps(_mm_div_ps(rectFloat, appUnitsPacked), scalesPacked);
+
+  // Floor
+  // Done as "add a 0.5 bias, then floor", since round-nearest rounds 0.5 downward half the time.
+  rectFloat = _mm_add_ps(rectFloat, biasesPacked);
+  rectPacked = floor_ps2epi32(rectFloat);
+
+  topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, r.x, r.y
+  rectPacked = _mm_sub_epi32(rectPacked, topLeft); // X, Y, Width, Height
+
+  // Avoid negative width/height due to overflow.
+  __m128i mask = _mm_or_si128(_mm_cmpgt_epi32(rectPacked, _mm_setzero_si128()),
+                              _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+  // Mask will now contain [ 0xFFFFFFFF 0xFFFFFFFF (width <= 0 ? 0 : 0xFFFFFFFF) (height <= 0 ? 0 : 0xFFFFFFFF) ]
+  rectPacked = _mm_and_si128(rectPacked, mask);
+
+  _mm_storeu_si128((__m128i*)&rect, rectPacked);
+#else
+  rect.SetNonEmptyBox(NSToIntRoundUp(NSAppUnitsToFloatPixels(x,
                                      aAppUnitsPerPixel) * aXScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(y,
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(y,
                                      aAppUnitsPerPixel) * aYScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(XMost(),
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(XMost(),
                                      aAppUnitsPerPixel) * aXScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(YMost(),
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(YMost(),
                                      aAppUnitsPerPixel) * aYScale));
+#endif
   return rect;
 }
 
 // scale the rect but round to smallest containing rect
 inline mozilla::gfx::IntRect
 nsRect::ScaleToOutsidePixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {
   mozilla::gfx::IntRect rect;
+  // Rounding is done explicitly (floor_ps2epi32/ceil_ps2epi32) rather than via the SSE rounding
+  // mode, which ASAN builds appear not to respect. Android x86 builds have bindgen issues.
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+  __m128 appUnitsPacked = _mm_set_ps(aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel);
+  __m128 scalesPacked = _mm_set_ps(aYScale, aXScale, aYScale, aXScale);
+
+  __m128i rectPacked = _mm_loadu_si128((__m128i*)this); // x, y, w, h
+  __m128i topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, x, y
+
+  rectPacked = _mm_add_epi32(rectPacked, topLeft); // X, Y, XMost(), YMost()
+
+  __m128 rectFloat = _mm_cvtepi32_ps(rectPacked);
+
+  // Scale i.e. ([ x y xmost ymost ] / aAppUnitsPerPixel) * [ aXScale aYScale aXScale aYScale ]
+  rectFloat = _mm_mul_ps(_mm_div_ps(rectFloat, appUnitsPacked), scalesPacked);
+  rectPacked = ceil_ps2epi32(rectFloat); // xx, xx, XMost(), YMost()
+  __m128i tmp = floor_ps2epi32(rectFloat); // x, y, xx, xx
+
+  // _mm_move_sd (low 64 bits from tmp, high 64 bits kept) is 1 cycle method of getting the blending we want.
+  rectPacked = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(rectPacked), _mm_castsi128_pd(tmp))); // x, y, XMost(), YMost()
+
+  topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, r.x, r.y
+  rectPacked = _mm_sub_epi32(rectPacked, topLeft); // r.x, r.y, r.w, r.h
+
+  // Avoid negative width/height due to overflow.
+  __m128i mask = _mm_or_si128(_mm_cmpgt_epi32(rectPacked, _mm_setzero_si128()),
+                              _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+  // Mask will now contain [ 0xFFFFFFFF 0xFFFFFFFF (width <= 0 ? 0 : 0xFFFFFFFF) (height <= 0 ? 0 : 0xFFFFFFFF) ]
+  rectPacked = _mm_and_si128(rectPacked, mask);
+
+  _mm_storeu_si128((__m128i*)&rect, rectPacked);
+#else
   rect.SetNonEmptyBox(NSToIntFloor(NSAppUnitsToFloatPixels(x,
                                    float(aAppUnitsPerPixel)) * aXScale),
                       NSToIntFloor(NSAppUnitsToFloatPixels(y,
                                    float(aAppUnitsPerPixel)) * aYScale),
                       NSToIntCeil(NSAppUnitsToFloatPixels(XMost(),
                                    float(aAppUnitsPerPixel)) * aXScale),
                       NSToIntCeil(NSAppUnitsToFloatPixels(YMost(),
                                    float(aAppUnitsPerPixel)) * aYScale));
+#endif
   return rect;
 }
 
 // scale the rect but round to largest contained rect
 inline mozilla::gfx::IntRect
 nsRect::ScaleToInsidePixels(float aXScale, float aYScale,
                             nscoord aAppUnitsPerPixel) const
 {
--- a/layout/generic/WritingModes.h
+++ b/layout/generic/WritingModes.h
@@ -1925,17 +1925,17 @@ public:
     mBStart = std::max(aRect1.mBStart, aRect2.mBStart);
     mBSize = bEnd - mBStart;
 
     if (mISize < 0 || mBSize < 0) {
       mISize = 0;
       mBSize = 0;
     }
 
-    MOZ_ASSERT(rectDebug.IsEqualEdges(nsRect(mIStart, mBStart, mISize, mBSize)));
+    MOZ_ASSERT((rectDebug.IsEmpty() && (mISize == 0 || mBSize == 0)) || rectDebug.IsEqualEdges(nsRect(mIStart, mBStart, mISize, mBSize)));
     return mISize > 0 && mBSize > 0;
   }
 
 private:
   LogicalRect() = delete;
 
 #ifdef DEBUG
   WritingMode GetWritingMode() const { return mWritingMode; }