Bug 1551916 - Add a boolean to every chunk for a long-line vector indicating whether that chunk contains any multiple-unit code points, so that column computations inside wholly-single-unit chunks can do a constant-time pointer-range computation... r=arai
authorJeff Walden <jwalden@mit.edu>
Fri, 17 May 2019 03:21:06 +0000
changeset 474288 9ae652e07ac2e3481d3cf8ce27a8f12d81d8e79c
parent 474287 70c4b663298ae5c4b287d020cc0842625fe6f52c
child 474289 44a79556ff86f3c3ead666fc2fd973e8b24681ea
push id113144
push usershindli@mozilla.com
push dateFri, 17 May 2019 16:44:55 +0000
treeherdermozilla-inbound@f4c4b796f845 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersarai
bugs1551916
milestone68.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1551916 - Add a boolean to every chunk for a long-line vector indicating whether that chunk contains any multiple-unit code points, so that column computations inside wholly-single-unit chunks can do a constant-time pointer-range computation... r=arai ...and avoid iterating at all. r=arai Depends on D31302 Differential Revision: https://phabricator.services.mozilla.com/D31303
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -755,61 +755,81 @@ uint32_t TokenStreamAnyChars::computePar
     lastChunkVectorForLine_ = nullptr;
     lastOffsetOfComputedColumn_ = start;
     lastComputedColumn_ = 0;
   }
 
   // Compute and return the final column number from a partial offset/column,
   // using the last-cached offset/column if they're more optimal.
   auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
-                                                        uint32_t partialCols) {
+                                                        uint32_t partialCols,
+                                                        UnitsType unitsType) {
     MOZ_ASSERT(partialOffset <= offset);
 
     // If the last lookup on this line was closer to |offset|, use it.
     if (partialOffset < this->lastOffsetOfComputedColumn_ &&
         this->lastOffsetOfComputedColumn_ <= offset) {
       partialOffset = this->lastOffsetOfComputedColumn_;
       partialCols = this->lastComputedColumn_;
     }
 
     const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
     const Unit* end = sourceUnits.codeUnitPtrAt(offset);
 
-    partialOffset += PointerRangeSize(begin, end);
-    partialCols += AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
+    size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
+    partialOffset += offsetDelta;
+
+    if (unitsType == UnitsType::GuaranteedSingleUnit) {
+      MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
+                 "guaranteed-single-units also guarantee pointer distance "
+                 "equals code point count");
+      partialCols += offsetDelta;
+    } else {
+      partialCols +=
+          AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
+    }
 
     this->lastOffsetOfComputedColumn_ = partialOffset;
     this->lastComputedColumn_ = partialCols;
     return partialCols;
   };
 
   const uint32_t offsetInLine = offset - start;
 
-  // The index within a relevant |Vector<uint32_t>| of the nearest chunk
-  // info...if it's been computed at all.
+  // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
   const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
-
-  // Compute the column from the start of the line if chunk information would
-  // direct us to the start of the line -- including if the line's too short to
-  // be chunked.
   if (chunkIndex == 0) {
-    return ColumnFromPartial(start, 0);
+    // We don't know from an |offset| in the zeroth chunk that this line is even
+    // long.  First-chunk info is mostly useless, anyway -- we have |start|
+    // already.  So if we have *easy* access to that zeroth chunk, use it --
+    // otherwise just count pessimally.  (This will still benefit from caching
+    // the last column/offset for computations for successive offsets, so it's
+    // not *always* worst-case.)
+    UnitsType unitsType;
+    if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
+      MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
+      unitsType = (*lastChunkVectorForLine_)[0].unitsType();
+    } else {
+      unitsType = UnitsType::PossiblyMultiUnit;
+    }
+
+    return ColumnFromPartial(start, 0, unitsType);
   }
 
   // If this line has no chunk vector yet, insert one in the hash map.  (The
   // required index is allocated and filled further down.)
   if (!lastChunkVectorForLine_) {
     auto ptr = longLineColumnInfo_.lookupForAdd(line);
     if (!ptr) {
       // This could rehash and invalidate a cached vector pointer, but the outer
       // condition means we don't have a cached pointer.
-      if (!longLineColumnInfo_.add(ptr, line, Vector<uint32_t>(cx))) {
+      if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
         // In case of OOM, just count columns from the start of the line.
         cx->recoverFromOutOfMemory();
-        return ColumnFromPartial(start, 0);
+        return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
       }
     }
 
     // Note that adding elements to this vector won't invalidate this pointer.
     lastChunkVectorForLine_ = &ptr->value();
   }
 
   const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
@@ -823,80 +843,117 @@ uint32_t TokenStreamAnyChars::computePar
     MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
 
     uint32_t naiveOffset = start + index * ColumnChunkLength;
     const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
 
     const Unit* actualPtr = naivePtr;
     RetractPointerToCodePointBoundary(&actualPtr, limit);
 
+#  ifdef DEBUG
+    if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
+        UnitsType::GuaranteedSingleUnit) {
+      MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
+    }
+#  endif
+
     return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
   };
 
   uint32_t partialOffset;
   uint32_t partialColumn;
+  UnitsType unitsType;
 
   auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
-  if (entriesLen <= chunkIndex) {
+  if (chunkIndex < entriesLen) {
+    // We've computed the chunk |offset| resides in.  Compute the column number
+    // from the chunk.
+    partialOffset = RetractedOffsetOfChunk(chunkIndex);
+    partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
+
+    // This is exact if |chunkIndex| isn't the last chunk.
+    unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
+
+    // Otherwise the last chunk is pessimistically assumed to contain multi-unit
+    // code points because we haven't fully examined its contents yet -- they
+    // may not have been tokenized yet, they could contain encoding errors, or
+    // they might not even exist.
+    MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
+                  (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
+                      UnitsType::PossiblyMultiUnit);
+  } else {
     // Extend the vector from its last entry or the start of the line.  (This is
     // also a suitable partial start point if we must recover from OOM.)
     if (entriesLen > 0) {
       partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
-      partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1];
+      partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
     } else {
       partialOffset = start;
       partialColumn = 0;
     }
 
     if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
       // As earlier, just start from the greatest offset/column in case of OOM.
       cx->recoverFromOutOfMemory();
-      return ColumnFromPartial(partialOffset, partialColumn);
+      return ColumnFromPartial(partialOffset, partialColumn,
+                               UnitsType::PossiblyMultiUnit);
     }
 
     // OOM is no longer possible now.  \o/
 
-    // The vector always begins with the column of the line start, i.e. zero.
+    // The vector always begins with the column of the line start, i.e. zero,
+    // with chunk units pessimally assumed not single-unit.
     if (entriesLen == 0) {
-      lastChunkVectorForLine_->infallibleAppend(0);
+      lastChunkVectorForLine_->infallibleAppend(
+          ChunkInfo(0, UnitsType::PossiblyMultiUnit));
       entriesLen++;
     }
 
     do {
       const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
       const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
-          start + std::min(entriesLen * ColumnChunkLength, offsetInLine));
+          start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
 
       MOZ_ASSERT(begin < chunkLimit);
       MOZ_ASSERT(chunkLimit <= limit);
 
       static_assert(ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength,
                     "chunk length in code units must be able to contain the "
                     "largest encoding of a code point, for retracting below to "
                     "never underflow");
 
       // Prior tokenizing ensured that [begin, limit) is validly encoded, and
       // |begin < chunkLimit|, so any retraction here can't underflow.
       RetractPointerToCodePointBoundary(&chunkLimit, limit);
 
       MOZ_ASSERT(begin < chunkLimit);
       MOZ_ASSERT(chunkLimit <= limit);
 
-      partialOffset += PointerRangeSize(begin, chunkLimit);
-      partialColumn += unicode::CountCodePoints(begin, chunkLimit);
-
-      lastChunkVectorForLine_->infallibleAppend(partialColumn);
-      entriesLen++;
+      size_t numUnits = PointerRangeSize(begin, chunkLimit);
+      size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
+
+      // If this chunk (which will become non-final at the end of the loop) is
+      // all single-unit code points, annotate the chunk accordingly.
+      if (numUnits == numCodePoints) {
+        lastChunkVectorForLine_->back().guaranteeSingleUnits();
+      }
+
+      partialOffset += numUnits;
+      partialColumn += numCodePoints;
+
+      lastChunkVectorForLine_->infallibleEmplaceBack(
+          partialColumn, UnitsType::PossiblyMultiUnit);
     } while (entriesLen < chunkIndex + 1);
-  } else {
-    partialOffset = RetractedOffsetOfChunk(chunkIndex);
-    partialColumn = (*lastChunkVectorForLine_)[chunkIndex];
+
+    // We're at a spot in the current final chunk, and final chunks never have
+    // complete units information, so be pessimistic.
+    unitsType = UnitsType::PossiblyMultiUnit;
   }
 
-  return ColumnFromPartial(partialOffset, partialColumn);
+  return ColumnFromPartial(partialOffset, partialColumn, unitsType);
 }
 
 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
 
 template <typename Unit, class AnyCharsAccess>
 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
     LineToken lineToken, uint32_t offset) const {
   lineToken.assertConsistentOffset(offset);
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -749,26 +749,62 @@ class TokenStreamAnyChars : public Token
   const JS::ReadOnlyCompileOptions& options() const { return options_; }
 
   const char* getFilename() const { return filename_; }
 
  private:
 #if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
   static constexpr uint32_t ColumnChunkLength = 128;
 
+  enum class UnitsType : unsigned char{
+      PossiblyMultiUnit = 0,
+      GuaranteedSingleUnit = 1,
+  };
+
+  class ChunkInfo {
+   public:
+    ChunkInfo(uint32_t col, UnitsType type)
+        : unitsType_(static_cast<unsigned char>(type)) {
+      memcpy(column_, &col, sizeof(col));
+    }
+
+    uint32_t column() const {
+      uint32_t col;
+      memcpy(&col, column_, sizeof(uint32_t));
+      return col;
+    }
+
+    UnitsType unitsType() const {
+      MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
+      return static_cast<UnitsType>(unitsType_);
+    }
+
+    void guaranteeSingleUnits() {
+      MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
+                 "should only be setting to possibly optimize from the "
+                 "pessimistic case");
+      unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
+    }
+
+   private:
+    // Store everything in |unsigned char|s so everything packs.
+    unsigned char column_[sizeof(uint32_t)];
+    unsigned char unitsType_;
+  };
+
   /**
    * Line number (of lines at least |ColumnChunkLength| code units long) to
    * a sequence of the column numbers at |ColumnChunkLength| boundaries rewound
    * (if needed) to the nearest code point boundary.
    *
    * Entries appear in this map only when a column computation of sufficient
    * distance is performed on a line, and the vectors are lazily filled as
    * greater offsets within lines require column computations.
    */
-  mutable HashMap<uint32_t, Vector<uint32_t>> longLineColumnInfo_;
+  mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
 
  protected:
   // Options used for parsing/tokenizing.
   const JS::ReadOnlyCompileOptions& options_;
 
   Token tokens[ntokens];  // circular token buffer
  private:
@@ -818,23 +854,23 @@ class TokenStreamAnyChars : public Token
   //
   // But usually we need columns for advancing offsets through scripts.  By
   // caching the last ((line number, offset) => relative column) mapping (in
   // similar manner to how |SourceUnits::lastIndex_| is used to cache
   // (offset => line number) mappings) we can usually avoid re-iterating through
   // the common line prefix.
   //
   // Additionally, we avoid hash table lookup costs by caching the
-  // |Vector<uint32_t>*| for the line of the last lookup.  (|nullptr| means we
+  // |Vector<ChunkInfo>*| for the line of the last lookup.  (|nullptr| means we
   // have to look it up -- or it hasn't been created yet.)  This pointer is
   // invalidated when a lookup on a new line occurs, but as it's not a pointer
   // at literal element data, it's *not* invalidated when new entries are added
   // to such a vector.
   mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
-  mutable Vector<uint32_t>* lastChunkVectorForLine_ = nullptr;
+  mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
   mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
   mutable uint32_t lastComputedColumn_ = 0;
 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
 };
 
 constexpr char16_t CodeUnitValue(char16_t unit) { return unit; }
 
 constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {