Bug 1387399: Add SIMD optimizations for methods used frequently in nsRect. r=jrmuizel
author Bas Schouten <bschouten@mozilla.com>
Thu, 19 Apr 2018 15:56:29 +0200
changeset 468843 16a7ab26c645d8201aba76ddeee9322514e69bef
parent 468842 b5051b2393f283d7b507f663f57afb1b9a8078d8
child 468844 8076e3b5771825ae8d3437a7665e8c8102657d0a
push id 9165
push user asasaki@mozilla.com
push date Thu, 26 Apr 2018 21:04:54 +0000
treeherder mozilla-beta@064c3804de2e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 1387399
milestone 61.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1387399: Add SIMD optimizations for methods used frequently in nsRect. r=jrmuizel

This improves the DisplayList Mutate Talos test by about 5% on Windows, and yields numerous smaller improvements on DisplayList-heavy tasks.

MozReview-Commit-ID: tlEtPjqWI4
gfx/2d/2D.h
gfx/2d/BaseRect.h
gfx/2d/Factory.cpp
gfx/src/nsRect.h
layout/generic/WritingModes.h
--- a/gfx/2d/2D.h
+++ b/gfx/2d/2D.h
@@ -1548,16 +1548,17 @@ struct Config {
 class GFX2D_API Factory
 {
   using char_type = filesystem::Path::value_type;
 public:
   static void Init(const Config& aConfig);
   static void ShutDown();
 
   static bool HasSSE2();
+  static bool HasSSE4();
 
   /**
    * Returns false if any of the following are true:
    *
    *   - the width/height of |sz| are less than or equal to zero
    *   - the width/height of |sz| are greater than |limit|
    *   - the number of bytes that need to be allocated for the surface is too
    *     big to fit in an int32_t, or bigger than |allocLimit|, if specifed
--- a/gfx/2d/BaseRect.h
+++ b/gfx/2d/BaseRect.h
@@ -119,17 +119,17 @@ struct BaseRect {
   // of *this and aRect.
   MOZ_MUST_USE Sub Intersect(const Sub& aRect) const
   {
     Sub result;
     result.x = std::max<T>(x, aRect.x);
     result.y = std::max<T>(y, aRect.y);
     result.width = std::min<T>(x - result.x + width, aRect.x - result.x + aRect.width);
     result.height = std::min<T>(y - result.y + height, aRect.y - result.y + aRect.height);
-    if (result.width < 0 || result.height < 0) {
+    if (result.width <= 0 || result.height <= 0) {
       result.SizeTo(0, 0);
     }
     return result;
   }
   // Sets *this to be the rectangle containing the intersection of the points
   // (including edges) of *this and aRect. If there are no points in that
   // intersection, sets *this to be an empty rectangle with x/y set to the std::max
   // of the x/y of *this and aRect.
--- a/gfx/2d/Factory.cpp
+++ b/gfx/2d/Factory.cpp
@@ -82,17 +82,18 @@ GetGFX2DLog()
 
 // The following code was largely taken from xpcom/glue/SSE.cpp and
 // made a little simpler.
 enum CPUIDRegister { eax = 0, ebx = 1, ecx = 2, edx = 3 };
 
 #ifdef HAVE_CPUID_H
 
 #if !(defined(__SSE2__) || defined(_M_X64) || \
-     (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) || \
+    !defined(__SSE4__)
 // cpuid.h is available on gcc 4.3 and higher on i386 and x86_64
 #include <cpuid.h>
 
 static inline bool
 HasCPUIDBit(unsigned int level, CPUIDRegister reg, unsigned int bit)
 {
   unsigned int regs[4];
   return __get_cpuid(level, &regs[0], &regs[1], &regs[2], &regs[3]) &&
@@ -277,16 +278,39 @@ Factory::HasSSE2()
     sDetectionState = HasCPUIDBit(1u, edx, (1u<<26)) ? HAS_SSE2 : NO_SSE2;
   }
   return sDetectionState == HAS_SSE2;
 #else
   return false;
 #endif
 }
 
+bool
+Factory::HasSSE4()
+{
+#if defined(__SSE4_1__)
+  // Built with SSE4.1 enabled (e.g. gcc/clang -msse4.1, which defines
+  // __SSE4_1__); no supported compiler predefines a plain __SSE4__ macro.
+  return true;
+#elif defined(HAVE_CPU_DETECTION)
+  static enum {
+    UNINITIALIZED,
+    NO_SSE4,
+    HAS_SSE4
+  } sDetectionState = UNINITIALIZED;
+  // Probe once and cache: CPUID leaf 1, ECX bit 19 is the SSE4.1 feature flag.
+  if (sDetectionState == UNINITIALIZED) {
+    sDetectionState = HasCPUIDBit(1u, ecx, (1u << 19)) ? HAS_SSE4 : NO_SSE4;
+  }
+  return sDetectionState == HAS_SSE4;
+#else
+  return false;
+#endif
+}
+
 // If the size is "reasonable", we want gfxCriticalError to assert, so
 // this is the option set up for it.
 inline int LoggerOptionsBasedOnSize(const IntSize& aSize)
 {
   return CriticalLog::DefaultOptions(Factory::ReasonableSurfaceSize(aSize));
 }
 
 bool
--- a/gfx/src/nsRect.h
+++ b/gfx/src/nsRect.h
@@ -8,22 +8,27 @@
 #ifndef NSRECT_H
 #define NSRECT_H
 
 #include <stdio.h>                      // for FILE
 #include <stdint.h>                     // for int32_t, int64_t
 #include <algorithm>                    // for min/max
 #include "mozilla/Likely.h"             // for MOZ_UNLIKELY
 #include "mozilla/gfx/Rect.h"
+#include "mozilla/gfx/2D.h"
+#include "mozilla/gfx/Logging.h"
 #include "nsCoord.h"                    // for nscoord, etc
 #include "nsISupportsImpl.h"            // for MOZ_COUNT_CTOR, etc
 #include "nsPoint.h"                    // for nsIntPoint, nsPoint
 #include "nsMargin.h"                   // for nsIntMargin, nsMargin
 #include "nsSize.h"                     // for IntSize, nsSize
 #include "nscore.h"                     // for NS_BUILD_REFCNT_LOGGING
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#include "smmintrin.h"
+#endif
 
 typedef mozilla::gfx::IntRect nsIntRect;
 
 struct nsRect :
   public mozilla::gfx::BaseRect<nscoord, nsRect, nsPoint, nsSize, nsMargin> {
   typedef mozilla::gfx::BaseRect<nscoord, nsRect, nsPoint, nsSize, nsMargin> Super;
 
   static void VERIFY_COORD(nscoord aValue) { ::VERIFY_COORD(aValue); }
@@ -115,16 +120,89 @@ struct nsRect :
   MOZ_MUST_USE nsRect UnsafeUnion(const nsRect& aRect) const
   {
     return Super::Union(aRect);
   }
   void UnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.Union(aRect2);
   }
+
+#if defined(_MSC_VER) && !defined(__clang__)
+  // Only MSVC supports inlining intrinsics for archs you're not compiling for.
+  MOZ_MUST_USE nsRect Intersect(const nsRect& aRect) const
+  {
+    nsRect result;
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect); // x1, y1, w1, h1 (assumes nsRect is 4 packed int32s)
+      __m128i rect2 = _mm_loadu_si128((__m128i*)this); // x2, y2, w2, h2
+      // Lane-wise max: lanes 0/1 become the result origin, lanes 2/3 are junk (zz).
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // SIMD form of the two scalar statements below; the 8-byte right shift moves
+      // each rect's width/height into lanes 0/1 so they line up with x/y.
+      // result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+      // result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // wr, hr, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+      // Blend mask 0xF0 takes the upper four 16-bit words (lanes 2/3) from widthheight.
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      // Movemask bits 2 and 3 (0xC) are both set only if width > 0 and height > 0.
+      if ((_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(resultRect, _mm_setzero_si128()))) & 0xC) != 0xC) {
+        // Empty result. Storing all 0s is potentially more efficient, but the non SSE4
+        // code leaves x/y intact, so only the width/height lanes are cleared here.
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+      }
+
+      _mm_storeu_si128((__m128i*)&result, resultRect);
+
+      return result;
+    }
+    // Scalar fallback, used when Factory::HasSSE4() returns false.
+    result.x = std::max<int32_t>(x, aRect.x);
+    result.y = std::max<int32_t>(y, aRect.y);
+    result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+    result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+    if (result.width <= 0 || result.height <= 0) {
+      result.SizeTo(0, 0);
+    }
+    return result;
+  }
+
+  bool IntersectRect(const nsRect& aRect1, const nsRect& aRect2)
+  {
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect1); // x1, y1, w1, h1 (assumes nsRect is 4 packed int32s)
+      __m128i rect2 = _mm_loadu_si128((__m128i*)&aRect2); // x2, y2, w2, h2
+      // Lane-wise max: lanes 0/1 become the result origin, lanes 2/3 are junk (zz).
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+      // result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // wr, hr, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+      // Blend mask 0xF0 takes the upper four 16-bit words (lanes 2/3) from widthheight.
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      // Movemask bits 2 and 3 (0xC) are both set only if width > 0 and height > 0.
+      if ((_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(resultRect, _mm_setzero_si128()))) & 0xC) != 0xC) {
+        // Empty result. Storing all 0s is potentially more efficient, but the non SSE4
+        // code leaves x/y intact, so only the width/height lanes are cleared here.
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+        _mm_storeu_si128((__m128i*)this, resultRect);
+        return false;
+      }
+
+      _mm_storeu_si128((__m128i*)this, resultRect);
+
+      return true;
+    }
+    *static_cast<nsRect*>(this) = aRect1.Intersect(aRect2); // reached only when HasSSE4() is false
+    return !IsEmpty();
+  }
+#endif
 #endif
 
   void SaturatingUnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.SaturatingUnion(aRect2);
   }
   void SaturatingUnionRectEdges(const nsRect& aRect1, const nsRect& aRect2)
   {
@@ -209,47 +287,133 @@ nsRect::ScaleToOtherAppUnitsRoundIn(int3
   nsRect rect;
   rect.SetBox(NSToCoordCeil(NSCoordScale(x, aFromAPP, aToAPP)),
               NSToCoordCeil(NSCoordScale(y, aFromAPP, aToAPP)),
               NSToCoordFloor(NSCoordScale(XMost(), aFromAPP, aToAPP)),
               NSToCoordFloor(NSCoordScale(YMost(), aFromAPP, aToAPP)));
   return rect;
 }
 
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+// Floors each float lane to an int32 using SSE2 only; life would be so much better with SSE4 here.
+static MOZ_ALWAYS_INLINE __m128i floor_ps2epi32(__m128 x)
+{
+  __m128 one = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+  // t = x truncated toward zero, converted back to float.
+  __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
+  __m128 r = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(x, t), one));
+  // In lanes where x < t (truncation moved the value up) 1.0 was subtracted above.
+  return _mm_cvttps_epi32(r);
+}
+
+static MOZ_ALWAYS_INLINE __m128i ceil_ps2epi32(__m128 x)
+{
+  __m128 t = _mm_sub_ps(_mm_setzero_ps(), x); // t = -x
+  __m128i r = _mm_sub_epi32(_mm_setzero_si128(), floor_ps2epi32(t)); // r = -floor(-x), i.e. ceil(x)
+
+  return r;
+}
+#endif
+
 // scale the rect but round to preserve centers
 inline mozilla::gfx::IntRect
 nsRect::ScaleToNearestPixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {
   mozilla::gfx::IntRect rect;
-  rect.SetNonEmptyBox(NSToIntRoundUp(NSAppUnitsToDoublePixels(x,
+  // The SIMD path is disabled on Android: Android x86 builds have bindgen issues.
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+  __m128 appUnitsPacked = _mm_set_ps(aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel);
+  __m128 scalesPacked = _mm_set_ps(aYScale, aXScale, aYScale, aXScale); // lanes 0..3: aXScale, aYScale, aXScale, aYScale
+  __m128 biasesPacked = _mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f);
+
+  __m128i rectPacked = _mm_loadu_si128((__m128i*)this); // x, y, w, h
+  __m128i topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, x, y
+
+  rectPacked = _mm_add_epi32(rectPacked, topLeft); // X, Y, XMost(), YMost()
+
+  __m128 rectFloat = _mm_cvtepi32_ps(rectPacked);
+
+  // Scale, i.e. ([ x y xmost ymost ] / aAppUnitsPerPixel) * [ aXScale aYScale aXScale aYScale ]
+  rectFloat = _mm_mul_ps(_mm_div_ps(rectFloat, appUnitsPacked), scalesPacked);
+
+  // Round half up: add a 0.5 bias, then floor.
+  // Done with bias and floor because round-to-nearest would round 0.5 downward half the time.
+  rectFloat = _mm_add_ps(rectFloat, biasesPacked);
+  rectPacked = floor_ps2epi32(rectFloat);
+
+  topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, r.x, r.y
+  rectPacked = _mm_sub_epi32(rectPacked, topLeft); // X, Y, Width, Height
+
+  // Avoid negative width/height due to overflow: clear each one that is not positive.
+  __m128i mask = _mm_or_si128(_mm_cmpgt_epi32(rectPacked, _mm_setzero_si128()),
+                              _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+  // Mask will now contain [ 0xFFFFFFFF 0xFFFFFFFF (width <= 0 ? 0 : 0xFFFFFFFF) (height <= 0 ? 0 : 0xFFFFFFFF) ]
+  rectPacked = _mm_and_si128(rectPacked, mask);
+
+  _mm_storeu_si128((__m128i*)&rect, rectPacked);
+#else
+  rect.SetNonEmptyBox(NSToIntRoundUp(NSAppUnitsToFloatPixels(x,
                                      aAppUnitsPerPixel) * aXScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(y,
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(y,
                                      aAppUnitsPerPixel) * aYScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(XMost(),
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(XMost(),
                                      aAppUnitsPerPixel) * aXScale),
-                      NSToIntRoundUp(NSAppUnitsToDoublePixels(YMost(),
+                      NSToIntRoundUp(NSAppUnitsToFloatPixels(YMost(),
                                      aAppUnitsPerPixel) * aYScale));
+#endif
   return rect;
 }
 
 // scale the rect but round to smallest containing rect
 inline mozilla::gfx::IntRect
 nsRect::ScaleToOutsidePixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {
   mozilla::gfx::IntRect rect;
+  // The SIMD path is disabled on Android: Android x86 builds have bindgen issues.
+#if !defined(ANDROID) && (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+  __m128 appUnitsPacked = _mm_set_ps(aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel, aAppUnitsPerPixel);
+  __m128 scalesPacked = _mm_set_ps(aYScale, aXScale, aYScale, aXScale); // lanes 0..3: aXScale, aYScale, aXScale, aYScale
+
+  __m128i rectPacked = _mm_loadu_si128((__m128i*)this); // x, y, w, h
+  __m128i topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, x, y
+
+  rectPacked = _mm_add_epi32(rectPacked, topLeft); // X, Y, XMost(), YMost()
+
+  __m128 rectFloat = _mm_cvtepi32_ps(rectPacked);
+
+  // Scale i.e. ([ x y xmost ymost ] / aAppUnitsPerPixel) * [ aXScale aYScale aXScale aYScale ]
+  rectFloat = _mm_mul_ps(_mm_div_ps(rectFloat, appUnitsPacked), scalesPacked);
+  rectPacked = ceil_ps2epi32(rectFloat); // xx, xx, XMost(), YMost()
+  __m128i tmp = floor_ps2epi32(rectFloat); // x, y, xx, xx
+
+  // _mm_move_sd is a 1 cycle way to blend: it takes the low 64 bits (x, y) from tmp.
+  rectPacked = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(rectPacked), _mm_castsi128_pd(tmp))); // x, y, XMost(), YMost()
+
+  topLeft = _mm_slli_si128(rectPacked, 8); // 0, 0, r.x, r.y
+  rectPacked = _mm_sub_epi32(rectPacked, topLeft); // r.x, r.y, r.w, r.h
+
+  // Avoid negative width/height due to overflow: clear each one that is not positive.
+  __m128i mask = _mm_or_si128(_mm_cmpgt_epi32(rectPacked, _mm_setzero_si128()),
+                              _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
+  // Mask will now contain [ 0xFFFFFFFF 0xFFFFFFFF (width <= 0 ? 0 : 0xFFFFFFFF) (height <= 0 ? 0 : 0xFFFFFFFF) ]
+  rectPacked = _mm_and_si128(rectPacked, mask);
+
+  _mm_storeu_si128((__m128i*)&rect, rectPacked);
+#else
   rect.SetNonEmptyBox(NSToIntFloor(NSAppUnitsToFloatPixels(x,
                                    float(aAppUnitsPerPixel)) * aXScale),
                       NSToIntFloor(NSAppUnitsToFloatPixels(y,
                                    float(aAppUnitsPerPixel)) * aYScale),
                       NSToIntCeil(NSAppUnitsToFloatPixels(XMost(),
                                    float(aAppUnitsPerPixel)) * aXScale),
                       NSToIntCeil(NSAppUnitsToFloatPixels(YMost(),
                                    float(aAppUnitsPerPixel)) * aYScale));
+#endif
   return rect;
 }
 
 // scale the rect but round to largest contained rect
 inline mozilla::gfx::IntRect
 nsRect::ScaleToInsidePixels(float aXScale, float aYScale,
                             nscoord aAppUnitsPerPixel) const
 {
--- a/layout/generic/WritingModes.h
+++ b/layout/generic/WritingModes.h
@@ -1925,17 +1925,17 @@ public:
     mBStart = std::max(aRect1.mBStart, aRect2.mBStart);
     mBSize = bEnd - mBStart;
 
     if (mISize < 0 || mBSize < 0) {
       mISize = 0;
       mBSize = 0;
     }
 
-    MOZ_ASSERT(rectDebug.IsEqualEdges(nsRect(mIStart, mBStart, mISize, mBSize)));
+    MOZ_ASSERT((rectDebug.IsEmpty() && (mISize == 0 || mBSize == 0)) || rectDebug.IsEqualEdges(nsRect(mIStart, mBStart, mISize, mBSize)));
     return mISize > 0 && mBSize > 0;
   }
 
 private:
   LogicalRect() = delete;
 
 #ifdef DEBUG
   WritingMode GetWritingMode() const { return mWritingMode; }