Bug 1387399: Add SIMD optimizations for methods commonly used in nsRect. r=jrmuizel draft
author Bas Schouten <bschouten@mozilla.com>
Tue, 08 Aug 2017 17:21:50 +0200
changeset 642606 be561c57d801bd8ea1fb4f5388a6d53002687670
parent 620691 7b9b3e4bfad01cac251a3586876c976e2c6e69d0
child 642607 b4e3c00bd920bc8845a23760b84a73d040c3cb04
push id 72819
push user bschouten@mozilla.com
push date Tue, 08 Aug 2017 15:31:50 +0000
reviewers jrmuizel
bugs 1387399
milestone 57.0a1
Bug 1387399: Add SIMD optimizations for methods commonly used in nsRect. r=jrmuizel MozReview-Commit-ID: AMgHAFZ3FL3
gfx/2d/2D.h
gfx/2d/Factory.cpp
gfx/2d/Rect.h
gfx/src/nsRect.h
--- a/gfx/2d/2D.h
+++ b/gfx/2d/2D.h
@@ -1444,16 +1444,17 @@ struct Config {
 
 class GFX2D_API Factory
 {
 public:
   static void Init(const Config& aConfig);
   static void ShutDown();
 
   static bool HasSSE2();
+  static bool HasSSE4();
 
   /**
    * Returns false if any of the following are true:
    *
    *   - the width/height of |sz| are less than or equal to zero
    *   - the width/height of |sz| are greater than |limit|
    *   - the number of bytes that need to be allocated for the surface is too
    *     big to fit in an int32_t, or bigger than |allocLimit|, if specifed
--- a/gfx/2d/Factory.cpp
+++ b/gfx/2d/Factory.cpp
@@ -275,16 +275,39 @@ Factory::HasSSE2()
     sDetectionState = HasCPUIDBit(1u, edx, (1u<<26)) ? HAS_SSE2 : NO_SSE2;
   }
   return sDetectionState == HAS_SSE2;
 #else
   return false;
 #endif
 }
 
+bool
+Factory::HasSSE4()
+{
+#if defined(__SSE4_1__)
+  // Compiled with SSE4.1 enabled (e.g. gcc/clang -msse4.1): no runtime probe needed.
+  // cl.exe never defines this macro, so MSVC builds use the CPUID probe below.
+  return true;
+#elif defined(HAVE_CPU_DETECTION) && defined(XP_WIN)
+  static enum {
+    UNINITIALIZED,
+    NO_SSE4,
+    HAS_SSE4
+  } sDetectionState = UNINITIALIZED;
+  // CPUID leaf 1, ECX bit 19 is the SSE4.1 feature flag; probe once and cache.
+  if (sDetectionState == UNINITIALIZED) {
+    sDetectionState = HasCPUIDBit(1u, ecx, (1u << 19)) ? HAS_SSE4 : NO_SSE4;
+  }
+  return sDetectionState == HAS_SSE4;
+#else
+  return false;
+#endif
+}
+
 // If the size is "reasonable", we want gfxCriticalError to assert, so
 // this is the option set up for it.
 inline int LoggerOptionsBasedOnSize(const IntSize& aSize)
 {
   return CriticalLog::DefaultOptions(Factory::ReasonableSurfaceSize(aSize));
 }
 
 bool
--- a/gfx/2d/Rect.h
+++ b/gfx/2d/Rect.h
@@ -7,16 +7,17 @@
 #define MOZILLA_GFX_RECT_H_
 
 #include "BaseRect.h"
 #include "BaseMargin.h"
 #include "NumericTools.h"
 #include "Point.h"
 #include "Tools.h"
 #include "mozilla/Maybe.h"
+#include "smmintrin.h"
 
 #include <cmath>
 
 namespace mozilla {
 
 template <typename> struct IsPixel;
 
 namespace gfx {
--- a/gfx/src/nsRect.h
+++ b/gfx/src/nsRect.h
@@ -6,16 +6,17 @@
 
 #ifndef NSRECT_H
 #define NSRECT_H
 
 #include <stdio.h>                      // for FILE
 #include <stdint.h>                     // for int32_t, int64_t
 #include <algorithm>                    // for min/max
 #include "mozilla/Likely.h"             // for MOZ_UNLIKELY
+#include "mozilla/gfx/2D.h"             // for Factory
 #include "mozilla/gfx/Rect.h"
 #include "nsCoord.h"                    // for nscoord, etc
 #include "nsISupportsImpl.h"            // for MOZ_COUNT_CTOR, etc
 #include "nsPoint.h"                    // for nsIntPoint, nsPoint
 #include "nsMargin.h"                   // for nsIntMargin, nsMargin
 #include "nsSize.h"                     // for IntSize, nsSize
 #include "nscore.h"                     // for NS_BUILD_REFCNT_LOGGING
 
@@ -113,16 +114,73 @@ struct nsRect :
   MOZ_MUST_USE nsRect Union(const nsRect& aRect) const
   {
     return SaturatingUnion(aRect);
   }
   void UnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.Union(aRect2);
   }
+
+#ifdef XP_WIN
+  // Returns the overlap of |this| and aRect. When the rects do not overlap the
+  // computed width/height go negative; both are then reset to 0 (x/y are kept).
+  MOZ_MUST_USE nsRect Intersect(const nsRect& aRect) const
+  {
+    nsRect result;
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect); // x1, y1, w1, h1
+      __m128i rect2 = _mm_loadu_si128((__m128i*)this); // x2, y2, w2, h2
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // Lanes 0/1 become min(x1 - xr + w1, x2 - xr + w2) and min(y1 - yr + h1, y2 - yr + h2).
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // w, h, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      if ((_mm_movemask_epi8(_mm_cmpgt_epi32(_mm_setzero_si128(), resultRect)) & 0xFF00) != 0) {
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, -1, -1)); // xr, yr, 00, 00
+      }
+      _mm_storeu_si128((__m128i*)&result, resultRect);
+      return result;
+    }
+    result.x = std::max<int32_t>(x, aRect.x);
+    result.y = std::max<int32_t>(y, aRect.y);
+    result.width = std::min<int32_t>(x - result.x + width, aRect.x - result.x + aRect.width);
+    result.height = std::min<int32_t>(y - result.y + height, aRect.y - result.y + aRect.height);
+    if (result.width < 0 || result.height < 0) {
+      result.width = result.height = 0;
+    }
+    return result;
+  }
+
+  // Sets |this| to the intersection of aRect1 and aRect2 (a negative size becomes 0 x 0); returns false if it is empty.
+  bool IntersectRect(const nsRect& aRect1, const nsRect& aRect2)
+  {
+    if (mozilla::gfx::Factory::HasSSE4()) {
+      __m128i rect1 = _mm_loadu_si128((__m128i*)&aRect1); // x1, y1, w1, h1
+      __m128i rect2 = _mm_loadu_si128((__m128i*)&aRect2); // x2, y2, w2, h2
+      __m128i resultRect = _mm_max_epi32(rect1, rect2); // xr, yr, zz, zz
+      // Lanes 0/1 become min(x1 - xr + w1, x2 - xr + w2) and min(y1 - yr + h1, y2 - yr + h2).
+      __m128i widthheight = _mm_min_epi32(_mm_add_epi32(_mm_sub_epi32(rect1, resultRect), _mm_srli_si128(rect1, 8)),
+                                          _mm_add_epi32(_mm_sub_epi32(rect2, resultRect), _mm_srli_si128(rect2, 8))); // w, h, zz, zz
+      widthheight = _mm_slli_si128(widthheight, 8); // 00, 00, wr, hr
+      resultRect = _mm_blend_epi16(resultRect, widthheight, 0xF0); // xr, yr, wr, hr
+      if ((_mm_movemask_epi8(_mm_cmpgt_epi32(_mm_setzero_si128(), resultRect)) & 0xFF00) != 0) {
+        resultRect = _mm_and_si128(resultRect, _mm_set_epi32(0, 0, -1, -1)); // xr, yr, 00, 00
+      }
+      _mm_storeu_si128((__m128i*)this, resultRect);
+      return (_mm_movemask_epi8(_mm_cmpgt_epi32(resultRect, _mm_setzero_si128())) & 0xFF00) == 0xFF00;
+    }
+    *static_cast<nsRect*>(this) = aRect1.Intersect(aRect2);
+    if (width < 0 || height < 0) {
+      width = height = 0;
+    }
+    return !IsEmpty();
+  }
+#endif
 #endif
 
   void SaturatingUnionRect(const nsRect& aRect1, const nsRect& aRect2)
   {
     *this = aRect1.SaturatingUnion(aRect2);
   }
   void SaturatingUnionRectEdges(const nsRect& aRect1, const nsRect& aRect2)
   {
@@ -219,23 +277,47 @@ nsRect::ScaleToOtherAppUnitsRoundIn(int3
 }
 
 // scale the rect but round to preserve centers
 inline mozilla::gfx::IntRect
 nsRect::ScaleToNearestPixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {
   mozilla::gfx::IntRect rect;
-  rect.x = NSToIntRoundUp(NSAppUnitsToDoublePixels(x, aAppUnitsPerPixel) * aXScale);
-  rect.y = NSToIntRoundUp(NSAppUnitsToDoublePixels(y, aAppUnitsPerPixel) * aYScale);
-  // Avoid negative widths and heights due to overflow
-  rect.width  = std::max(0, NSToIntRoundUp(NSAppUnitsToDoublePixels(XMost(),
-                               aAppUnitsPerPixel) * aXScale) - rect.x);
-  rect.height = std::max(0, NSToIntRoundUp(NSAppUnitsToDoublePixels(YMost(),
-                               aAppUnitsPerPixel) * aYScale) - rect.y);
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+  // SSE2 path; the four int32 lanes hold x, y, width, height.
+  // NOTE(review): computes in float where the scalar path uses double - confirm the precision loss is acceptable.
+  const __m128 appUnits = _mm_set1_ps(static_cast<float>(aAppUnitsPerPixel));
+  const __m128 scale = _mm_set_ps(aYScale, aXScale, aYScale, aXScale);
+  const __m128 half = _mm_set1_ps(0.5f);
+  // floor(v + 0.5): add the bias, then convert while rounding toward -infinity.
+  const unsigned int savedRounding = _MM_GET_ROUNDING_MODE();
+  _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+  __m128i recti = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this)); // x, y, w, h
+  recti = _mm_add_epi32(recti, _mm_slli_si128(recti, 8)); // x, y, XMost(), YMost()
+  __m128 rectf = _mm_mul_ps(_mm_div_ps(_mm_cvtepi32_ps(recti), appUnits), scale);
+  rectf = _mm_add_ps(rectf, half);
+  recti = _mm_cvtps_epi32(rectf); // r.x, r.y, r.XMost(), r.YMost()
+  recti = _mm_sub_epi32(recti, _mm_slli_si128(recti, 8)); // r.x, r.y, r.w, r.h
+  // Avoid negative widths and heights due to overflow (lanes 2 and 3 only; x/y may be negative).
+  const __m128i negativeSize = _mm_and_si128(_mm_srai_epi32(recti, 31), _mm_set_epi32(-1, -1, 0, 0));
+  recti = _mm_andnot_si128(negativeSize, recti);
+  _mm_storeu_si128((__m128i*)&rect, recti);
+
+  // Restore the rounding mode that was active on entry instead of assuming round-to-nearest.
+  _MM_SET_ROUNDING_MODE(savedRounding);
+#else
+  rect.x = NSToIntRoundUp(NSAppUnitsToDoublePixels(x, aAppUnitsPerPixel) * aXScale);
+  rect.y = NSToIntRoundUp(NSAppUnitsToDoublePixels(y, aAppUnitsPerPixel) * aYScale);
+  // Avoid negative widths and heights due to overflow
+  rect.width  = std::max(0, NSToIntRoundUp(NSAppUnitsToDoublePixels(XMost(),
+                               aAppUnitsPerPixel) * aXScale) - rect.x);
+  rect.height = std::max(0, NSToIntRoundUp(NSAppUnitsToDoublePixels(YMost(),
+                               aAppUnitsPerPixel) * aYScale) - rect.y);
+#endif
   return rect;
 }
 
 // scale the rect but round to smallest containing rect
 inline mozilla::gfx::IntRect
 nsRect::ScaleToOutsidePixels(float aXScale, float aYScale,
                              nscoord aAppUnitsPerPixel) const
 {