Bug 817058 - VMX acceleration for nsTextFragment. r=bzbarsky
author: Cameron Kaiser <spectre@floodgap.com>
Wed, 28 Aug 2019 05:03:54 +0000
changeset 554075 d3fe24ea630981ac7b2fe46912a03aff2665e1b6
parent 554074 d920fb2cbec23e894932d1892fb166aefac39e78
child 554076 9a111ae683fae0100c145a333e8fdd95086a2ef1
push id: 2165
push user: ffxbld-merge
push date: Mon, 14 Oct 2019 16:30:58 +0000
treeherder: mozilla-release@0eae18af659f [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bzbarsky
bugs: 817058
milestone: 70.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 817058 - VMX acceleration for nsTextFragment. r=bzbarsky

Differential Revision: https://phabricator.services.mozilla.com/D43566
dom/base/moz.build
dom/base/nsTextFragment.cpp
dom/base/nsTextFragmentVMX.cpp
--- a/dom/base/moz.build
+++ b/dom/base/moz.build
@@ -463,16 +463,22 @@ SOURCES += [
 ]
 
 # Are we targeting x86-32 or x86-64?  If so, we want to include SSE2 code for
 # nsTextFragment.cpp
 if CONFIG['INTEL_ARCHITECTURE']:
     SOURCES += ['nsTextFragmentSSE2.cpp']
     SOURCES['nsTextFragmentSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
 
+# Are we targeting PowerPC? If so, we can enable a SIMD version for
+# nsTextFragment.cpp as well.
+if CONFIG['CPU_ARCH'].startswith('ppc'):
+    SOURCES += ['nsTextFragmentVMX.cpp']
+    # PPC_VMX_FLAGS are attached to this one source file only, in the same
+    # way SSE2_FLAGS are attached to nsTextFragmentSSE2.cpp.
+    SOURCES['nsTextFragmentVMX.cpp'].flags += CONFIG['PPC_VMX_FLAGS']
+
 EXTRA_JS_MODULES += [
     'ContentAreaDropListener.jsm',
     'DOMRequestHelper.jsm',
     'IndexedDBHelper.jsm',
     'ProcessSelector.jsm',
     'SlowScriptDebug.jsm',
 ]
 
--- a/dom/base/nsTextFragment.cpp
+++ b/dom/base/nsTextFragment.cpp
@@ -14,16 +14,17 @@
 #include "nsCRT.h"
 #include "nsReadableUtils.h"
 #include "nsMemory.h"
 #include "nsBidiUtils.h"
 #include "nsUnicharUtils.h"
 #include "mozilla/CheckedInt.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/SSE.h"
+#include "mozilla/ppc.h"
 #include "nsTextFragmentImpl.h"
 #include <algorithm>
 
 #define TEXTFRAG_WHITE_AFTER_NEWLINE 50
 #define TEXTFRAG_MAX_NEWLINES 7
 
 // Static buffer used for common fragments
 static char* sSpaceSharedString[TEXTFRAG_MAX_NEWLINES + 1];
@@ -161,28 +162,40 @@ static inline int32_t FirstNon8BitUnvect
 #ifdef MOZILLA_MAY_SUPPORT_SSE2
 namespace mozilla {
 namespace SSE2 {
 int32_t FirstNon8Bit(const char16_t* str, const char16_t* end);
 }  // namespace SSE2
 }  // namespace mozilla
 #endif
 
+#ifdef __powerpc__
+namespace mozilla {
+namespace VMX {
+// Defined in nsTextFragmentVMX.cpp, which moz.build adds only when CPU_ARCH
+// starts with 'ppc'.
+int32_t FirstNon8Bit(const char16_t* str, const char16_t* end);
+}  // namespace VMX
+}  // namespace mozilla
+#endif
+
 /*
  * This function returns -1 if all characters in str are 8 bit characters.
  * Otherwise, it returns a value less than or equal to the index of the first
  * non-8bit character in str. For example, if first non-8bit character is at
  * position 25, it may return 25, or for example 24, or 16. But it guarantees
  * there is no non-8bit character before returned value.
  */
 static inline int32_t FirstNon8Bit(const char16_t* str, const char16_t* end) {
 #ifdef MOZILLA_MAY_SUPPORT_SSE2
   if (mozilla::supports_sse2()) {
     return mozilla::SSE2::FirstNon8Bit(str, end);
   }
+#elif defined(__powerpc__)
+  // Only considered when MOZILLA_MAY_SUPPORT_SSE2 is not defined. Use the VMX
+  // implementation when supports_vmx() returns true; otherwise fall through
+  // to the unvectorized scan below.
+  if (mozilla::supports_vmx()) {
+    return mozilla::VMX::FirstNon8Bit(str, end);
+  }
 #endif
 
   return FirstNon8BitUnvectorized(str, end);
 }
 
 bool nsTextFragment::SetTo(const char16_t* aBuffer, int32_t aLength,
                            bool aUpdateBidi, bool aForce2b) {
   if (aForce2b && mState.mIs2b && !m2b->IsReadonly()) {
new file mode 100644
--- /dev/null
+++ b/dom/base/nsTextFragmentVMX.cpp
@@ -0,0 +1,100 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// This file should only be compiled if you're on Power ISA.
+
+#include "nscore.h"
+#include "nsAlgorithm.h"
+#include "nsTextFragmentImpl.h"
+#include <altivec.h>
+
+namespace mozilla {
+namespace VMX {
+
+// VMX (AltiVec) implementation of FirstNon8Bit over the range [str, end).
+//
+// Returns -1 if no char16_t in the range is greater than 255. Otherwise
+// returns an index less than or equal to that of the first char16_t > 255:
+// the vector and word loops report the start of the chunk in which a hit was
+// seen rather than the exact position.
+int32_t FirstNon8Bit(const char16_t* str, const char16_t* end) {
+  const uint32_t numUnicharsPerVector = 8;
+  const uint32_t numCharsPerVector = 16;
+  // Paranoia. If this assertion is wrong, change the vector loop below.
+  // Both operands are compile-time constants, so check at compile time.
+  static_assert((numCharsPerVector / numUnicharsPerVector) == sizeof(char16_t),
+                "vector loop assumes two bytes per char16_t");
+
+  typedef Non8BitParameters<sizeof(size_t)> p;
+  const uint32_t alignMask = p::alignMask();
+  const size_t mask = p::mask();
+  const uint32_t numUnicharsPerWord = p::numUnicharsPerWord();
+
+  const uint32_t len = end - str;
+
+  // i shall count the index in unichars; i2 shall count the index in chars.
+  uint32_t i = 0;
+  uint32_t i2 = 0;
+
+  // Align ourselves to a 16-byte boundary, as required by VMX loads.
+  uint32_t alignLen = std::min(
+      len, uint32_t(((-NS_PTR_TO_UINT32(str)) & 0xf) / sizeof(char16_t)));
+
+  // Only take the vector path if at least one full vector remains after the
+  // alignment prefix; the loop below checks its first vector unconditionally.
+  if ((len - alignLen) >= numUnicharsPerVector) {
+    for (; i < alignLen; i++) {
+      if (str[i] > 255) return i;
+    }
+
+    // Construct a vector of shorts, each intended to hold 0x00FF (255). The
+    // merge operands are swapped between byte orders to get the same lanes.
+#if __LITTLE_ENDIAN__
+    const vector unsigned short gtcompare =
+        reinterpret_cast<vector unsigned short>(
+            vec_mergel(vec_splat_s8(-1), vec_splat_s8(0)));
+#else
+    const vector unsigned short gtcompare =
+        reinterpret_cast<vector unsigned short>(
+            vec_mergel(vec_splat_s8(0), vec_splat_s8(-1)));
+#endif
+    // NOTE(review): this is a count measured from the current i, yet the loop
+    // compares it against the absolute index. That can end the vector walk
+    // early (never late); the word and scalar loops below cover the rest.
+    const uint32_t vectWalkEnd =
+        ((len - i) / numUnicharsPerVector) * numUnicharsPerVector;
+    i2 = i * sizeof(char16_t);
+
+    while (1) {
+      vector unsigned short vect;
+
+      // Check one VMX register (8 unichars) at a time. The vec_any_gt
+      // intrinsic does exactly what we want. This loop is manually unrolled;
+      // it yields notable performance improvements this way.
+      // vec_ld is given the byte offset i2 from str, not the unichar index.
+#define CheckForASCII                                              \
+  vect = vec_ld(i2, reinterpret_cast<const unsigned short*>(str)); \
+  if (vec_any_gt(vect, gtcompare)) return i;                       \
+  i += numUnicharsPerVector;                                       \
+  if (!(i < vectWalkEnd)) break;                                   \
+  i2 += numCharsPerVector;
+
+      CheckForASCII CheckForASCII
+
+#undef CheckForASCII
+    }
+  } else {
+    // Too short for the vector path. Align ourselves to a word boundary.
+    alignLen = std::min(len, uint32_t(((-NS_PTR_TO_UINT32(str)) & alignMask) /
+                                      sizeof(char16_t)));
+    for (; i < alignLen; i++) {
+      if (str[i] > 255) return i;
+    }
+  }
+
+  // Check one word at a time. A non-zero (word & mask) presumably means some
+  // unichar in the word is above 255 (see Non8BitParameters) - TODO confirm.
+  const uint32_t wordWalkEnd =
+      ((len - i) / numUnicharsPerWord) * numUnicharsPerWord;
+  for (; i < wordWalkEnd; i += numUnicharsPerWord) {
+    const size_t word = *reinterpret_cast<const size_t*>(str + i);
+    if (word & mask) return i;
+  }
+
+  // Take care of the remainder one character at a time.
+  for (; i < len; i++) {
+    if (str[i] > 255) {
+      return i;
+    }
+  }
+
+  return -1;
+}
+
+}  // namespace VMX
+}  // namespace mozilla